import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
orders_df = pd.read_csv('olist_orders_dataset.csv' \
, parse_dates = ['order_purchase_timestamp' \
, 'order_approved_at', 'order_delivered_carrier_date' \
, 'order_delivered_customer_date', 'order_estimated_delivery_date'])
orders_df.head(5)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
orders_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 99441 entries, 0 to 99440 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 99441 non-null object 1 customer_id 99441 non-null object 2 order_status 99441 non-null object 3 order_purchase_timestamp 99441 non-null datetime64[ns] 4 order_approved_at 99281 non-null datetime64[ns] 5 order_delivered_carrier_date 97658 non-null datetime64[ns] 6 order_delivered_customer_date 96476 non-null datetime64[ns] 7 order_estimated_delivery_date 99441 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 6.1+ MB
orders_df.describe()
C:\Users\79307\anaconda3\envs\python_3_7\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. """Entry point for launching an IPython kernel.
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| count | 99441 | 99441 | 99441 | 99441 | 99281 | 97658 | 96476 | 99441 |
| unique | 99441 | 99441 | 8 | 98875 | 90733 | 81018 | 95664 | 459 |
| top | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2018-04-11 10:48:14 | 2018-02-27 04:31:10 | 2018-05-09 15:48:00 | 2018-05-08 23:38:46 | 2017-12-20 00:00:00 |
| freq | 1 | 1 | 96478 | 3 | 9 | 47 | 3 | 522 |
| first | NaN | NaN | NaN | 2016-09-04 21:15:19 | 2016-09-15 12:16:38 | 2016-10-08 10:34:01 | 2016-10-11 13:46:32 | 2016-09-30 00:00:00 |
| last | NaN | NaN | NaN | 2018-10-17 17:30:18 | 2018-09-03 17:40:06 | 2018-09-11 19:48:28 | 2018-10-17 13:22:46 | 2018-11-12 00:00:00 |
orders_df.drop(['order_id', 'customer_id'], axis=1).duplicated().sum()
0
'''дата оплаты позже даты передачи перевозчику'''
orders_df[orders_df['order_approved_at'] >= orders_df['order_delivered_carrier_date']]
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 15 | dcb36b511fcac050b97cd5c05de84dc3 | 3b6828a50ffe546942b7a473d70ac0fc | delivered | 2018-06-07 19:03:12 | 2018-06-12 23:31:02 | 2018-06-11 14:54:00 | 2018-06-21 15:34:32 | 2018-07-04 |
| 64 | 688052146432ef8253587b930b01a06d | 81e08b08e5ed4472008030d70327c71f | delivered | 2018-04-22 08:48:13 | 2018-04-24 18:25:22 | 2018-04-23 19:19:14 | 2018-04-24 19:31:58 | 2018-05-15 |
| 199 | 58d4c4747ee059eeeb865b349b41f53a | 1755fad7863475346bc6c3773fe055d3 | delivered | 2018-07-21 12:49:32 | 2018-07-26 23:31:53 | 2018-07-24 12:57:00 | 2018-07-25 23:58:19 | 2018-07-31 |
| 210 | 412fccb2b44a99b36714bca3fef8ad7b | c6865c523687cb3f235aa599afef1710 | delivered | 2018-07-22 22:30:05 | 2018-07-23 12:31:53 | 2018-07-23 12:24:00 | 2018-07-24 19:26:42 | 2018-07-31 |
| 415 | 56a4ac10a4a8f2ba7693523bb439eede | 78438ba6ace7d2cb023dbbc81b083562 | delivered | 2018-07-22 13:04:47 | 2018-07-27 23:31:09 | 2018-07-24 14:03:00 | 2018-07-28 00:05:39 | 2018-08-06 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99091 | 240ead1a7284667e0ec71d01f80e4d5e | fcdd7556401aaa1c980f8b67a69f95dc | delivered | 2018-07-02 16:30:02 | 2018-07-05 16:17:59 | 2018-07-05 14:11:00 | 2018-07-10 23:21:47 | 2018-07-24 |
| 99230 | 78008d03bd8ef7fcf1568728b316553c | 043e3254e68daf7256bda1c9c03c2286 | delivered | 2018-07-03 13:11:13 | 2018-07-05 16:32:52 | 2018-07-03 12:57:00 | 2018-07-10 17:47:39 | 2018-07-23 |
| 99266 | 76a948cd55bf22799753720d4545dd2d | 3f20a07b28aa252d0502fe7f7eb030a9 | delivered | 2018-01-30 02:41:30 | 2018-02-04 23:31:46 | 2018-01-31 18:11:58 | 2018-03-18 20:08:50 | 2018-03-02 |
| 99377 | a6bd1f93b7ff72cc348ca07f38ec4bee | 6d63fa86bd2f62908ad328325799152f | delivered | 2018-04-20 17:28:40 | 2018-04-24 19:26:10 | 2018-04-23 17:18:40 | 2018-04-28 17:38:42 | 2018-05-15 |
| 99406 | 7fd85cb0143de098a4c5ab5a57bfbd91 | d32034dfc685b1ae15dd4c78eace868e | delivered | 2017-05-04 21:05:28 | 2017-05-09 22:33:28 | 2017-05-09 17:02:58 | 2017-05-18 06:34:38 | 2017-06-01 |
1359 rows × 8 columns
orders_df.query('order_status == "canceled"').info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 625 entries, 397 to 99347 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 625 non-null object 1 customer_id 625 non-null object 2 order_status 625 non-null object 3 order_purchase_timestamp 625 non-null datetime64[ns] 4 order_approved_at 484 non-null datetime64[ns] 5 order_delivered_carrier_date 75 non-null datetime64[ns] 6 order_delivered_customer_date 6 non-null datetime64[ns] 7 order_estimated_delivery_date 625 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 43.9+ KB
'''создадим список не оплаченных отмененных заказов, чтобы затем удалить их из датафрейма'''
not_paid_canceled = orders_df.fillna(0) \
.query("order_status == 'canceled' and order_approved_at == 0") \
.order_id.to_list()
'''убеждаемся, что все верно, должно быть 141 значение (625 - 484)'''
len(not_paid_canceled)
141
'''удаляем неоплаченные отмененные заказы из датафрейма'''
orders_without_canceled = orders_df[~orders_df['order_id'].isin(not_paid_canceled)]
orders_without_canceled.query("order_status == 'delivered'").info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 96478 entries, 0 to 99440 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 96478 non-null object 1 customer_id 96478 non-null object 2 order_status 96478 non-null object 3 order_purchase_timestamp 96478 non-null datetime64[ns] 4 order_approved_at 96464 non-null datetime64[ns] 5 order_delivered_carrier_date 96476 non-null datetime64[ns] 6 order_delivered_customer_date 96470 non-null datetime64[ns] 7 order_estimated_delivery_date 96478 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 6.6+ MB
orders_without_canceled.query("order_status == 'delivered'")
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 |
| 99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 |
| 99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 |
| 99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 |
| 99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 |
96478 rows × 8 columns
'''создадим список с номерами заказов, которые хотим удалить'''
bug_delivered = orders_without_canceled.fillna(0) \
.query("order_status == 'delivered' and (order_approved_at == 0 or order_delivered_carrier_date == 0 or order_delivered_customer_date == 0)") \
.order_id.to_list()
len(bug_delivered)
23
'''удаляем сомнительные 'доставленные' заказы из датафрейма'''
orders_without_canceled_bugs = orders_without_canceled[~orders_without_canceled['order_id'].isin(bug_delivered)]
orders_without_canceled_bugs
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 |
| 99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 |
| 99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 |
| 99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 |
| 99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 |
99277 rows × 8 columns
'''посмотрим на оставшиеся статусы заказов'''
orders_without_canceled_bugs.order_status.unique()
array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
'canceled', 'created', 'approved'], dtype=object)
orders_without_canceled_bugs.query("order_status == 'invoiced'").info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 314 entries, 6 to 99066 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 314 non-null object 1 customer_id 314 non-null object 2 order_status 314 non-null object 3 order_purchase_timestamp 314 non-null datetime64[ns] 4 order_approved_at 314 non-null datetime64[ns] 5 order_delivered_carrier_date 0 non-null datetime64[ns] 6 order_delivered_customer_date 0 non-null datetime64[ns] 7 order_estimated_delivery_date 314 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 22.1+ KB
orders_without_canceled_bugs.query("order_status == 'shipped'").info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1107 entries, 44 to 99181 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 1107 non-null object 1 customer_id 1107 non-null object 2 order_status 1107 non-null object 3 order_purchase_timestamp 1107 non-null datetime64[ns] 4 order_approved_at 1107 non-null datetime64[ns] 5 order_delivered_carrier_date 1107 non-null datetime64[ns] 6 order_delivered_customer_date 0 non-null datetime64[ns] 7 order_estimated_delivery_date 1107 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 77.8+ KB
orders_without_canceled_bugs.query("order_status == 'processing'").info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 301 entries, 128 to 99313 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 301 non-null object 1 customer_id 301 non-null object 2 order_status 301 non-null object 3 order_purchase_timestamp 301 non-null datetime64[ns] 4 order_approved_at 301 non-null datetime64[ns] 5 order_delivered_carrier_date 0 non-null datetime64[ns] 6 order_delivered_customer_date 0 non-null datetime64[ns] 7 order_estimated_delivery_date 301 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 21.2+ KB
orders_without_canceled_bugs.query("order_status == 'created'").info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5 entries, 7434 to 58958 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 5 non-null object 1 customer_id 5 non-null object 2 order_status 5 non-null object 3 order_purchase_timestamp 5 non-null datetime64[ns] 4 order_approved_at 0 non-null datetime64[ns] 5 order_delivered_carrier_date 0 non-null datetime64[ns] 6 order_delivered_customer_date 0 non-null datetime64[ns] 7 order_estimated_delivery_date 5 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 360.0+ bytes
orders_without_canceled_bugs.query("order_status == 'approved'").info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2 entries, 44897 to 88457 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 2 non-null object 1 customer_id 2 non-null object 2 order_status 2 non-null object 3 order_purchase_timestamp 2 non-null datetime64[ns] 4 order_approved_at 2 non-null datetime64[ns] 5 order_delivered_carrier_date 0 non-null datetime64[ns] 6 order_delivered_customer_date 0 non-null datetime64[ns] 7 order_estimated_delivery_date 2 non-null datetime64[ns] dtypes: datetime64[ns](5), object(3) memory usage: 144.0+ bytes
'''убираем только созданные заказы'''
orders_final = orders_without_canceled_bugs.query('order_status != "created"')
'''наш итоговый датафрейм выглядит так:'''
orders_final
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 |
| 99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 |
| 99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 |
| 99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 |
| 99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 |
99272 rows × 8 columns
customers_df = pd.read_csv('olist_customers_dataset.csv')
customers_df
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP |
| 3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775 | mogi das cruzes | SP |
| 4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | 13056 | campinas | SP |
| ... | ... | ... | ... | ... | ... |
| 99436 | 17ddf5dd5d51696bb3d7c6291687be6f | 1a29b476fee25c95fbafc67c5ac95cf8 | 3937 | sao paulo | SP |
| 99437 | e7b71a9017aa05c9a7fd292d714858e8 | d52a67c98be1cf6a5c84435bd38d095d | 6764 | taboao da serra | SP |
| 99438 | 5e28dfe12db7fb50a4b2f691faecea5e | e9f50caf99f032f0bf3c55141f019d99 | 60115 | fortaleza | CE |
| 99439 | 56b18e2166679b8a959d72dd06da27f9 | 73c2643a0a458b49f58cea58833b192e | 92120 | canoas | RS |
| 99440 | 274fa6071e5e17fe303b9748641082c8 | 84732c5050c01db9b23e19ba39899398 | 6703 | cotia | SP |
99441 rows × 5 columns
'''все поля заполнены'''
customers_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 99441 entries, 0 to 99440 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 99441 non-null object 1 customer_unique_id 99441 non-null object 2 customer_zip_code_prefix 99441 non-null int64 3 customer_city 99441 non-null object 4 customer_state 99441 non-null object dtypes: int64(1), object(4) memory usage: 3.8+ MB
'''дубликатов нет'''
customers_df.duplicated().sum()
0
orders_customers_final = orders_final.merge(customers_df, how = 'left', on = 'customer_id')
orders_customers_final
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99267 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 | 6359f309b166b0196dbf7ad2ac62bb5a | 12209 | sao jose dos campos | SP |
| 99268 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 | da62f9e57a76d978d02ab5362c509660 | 11722 | praia grande | SP |
| 99269 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 | 737520a9aad80b3fbbdad19b66b37b30 | 45920 | nova vicosa | BA |
| 99270 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 | 5097a5312c8b157bb7be58ae360ef43c | 28685 | japuiba | RJ |
| 99271 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 | 60350aa974b26ff12caad89e55993bd6 | 83750 | lapa | PR |
99272 rows × 12 columns
orders_customers_final.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 99272 entries, 0 to 99271 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 99272 non-null object 1 customer_id 99272 non-null object 2 order_status 99272 non-null object 3 order_purchase_timestamp 99272 non-null datetime64[ns] 4 order_approved_at 99272 non-null datetime64[ns] 5 order_delivered_carrier_date 97637 non-null datetime64[ns] 6 order_delivered_customer_date 96461 non-null datetime64[ns] 7 order_estimated_delivery_date 99272 non-null datetime64[ns] 8 customer_unique_id 99272 non-null object 9 customer_zip_code_prefix 99272 non-null int64 10 customer_city 99272 non-null object 11 customer_state 99272 non-null object dtypes: datetime64[ns](5), int64(1), object(6) memory usage: 9.8+ MB
fig = px.histogram(orders_customers_final.customer_unique_id.value_counts())
fig.show()
'''убедимся еще раз, сделав расчеты'''
(orders_customers_final.groupby('customer_unique_id').order_id.count()==1).sum()
93040
orders_df
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 |
| 99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 |
| 99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 |
| 99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 |
| 99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 |
99441 rows × 8 columns
'''оставим только те заказы, где дата доставки отсутствует'''
orders_not_delivered = orders_df[orders_df.order_delivered_customer_date.isnull()]
'''удалим заказы со статусом "доставлен", это скорее всего баг'''
orders_not_delivered = orders_not_delivered.query('order_status != "delivered"')
orders_not_delivered
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 6 | 136cce7faa42fdb2cefd53fdc79a6098 | ed0271e0b7da060a393796590e7b737a | invoiced | 2017-04-11 12:22:08 | 2017-04-13 13:25:17 | NaT | NaT | 2017-05-09 |
| 44 | ee64d42b8cf066f35eac1cf57de1aa85 | caded193e8e47b8362864762a83db3c5 | shipped | 2018-06-04 16:44:48 | 2018-06-05 04:31:18 | 2018-06-05 14:32:00 | NaT | 2018-06-28 |
| 103 | 0760a852e4e9d89eb77bf631eaaf1c84 | d2a79636084590b7465af8ab374a8cf5 | invoiced | 2018-08-03 17:44:42 | 2018-08-07 06:15:14 | NaT | NaT | 2018-08-21 |
| 128 | 15bed8e2fec7fdbadb186b57c46c92f2 | f3f0e613e0bdb9c7cee75504f0f90679 | processing | 2017-09-03 14:22:03 | 2017-09-03 14:30:09 | NaT | NaT | 2017-10-03 |
| 154 | 6942b8da583c2f9957e990d028607019 | 52006a9383bf149a4fb24226b173106f | shipped | 2018-01-10 11:33:07 | 2018-01-11 02:32:30 | 2018-01-11 19:39:23 | NaT | 2018-02-07 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99283 | 3a3cddda5a7c27851bd96c3313412840 | 0b0d6095c5555fe083844281f6b093bb | canceled | 2018-08-31 16:13:44 | NaT | NaT | NaT | 2018-10-01 |
| 99313 | e9e64a17afa9653aacf2616d94c005b8 | b4cd0522e632e481f8eaf766a2646e86 | processing | 2018-01-05 23:07:24 | 2018-01-09 07:18:05 | NaT | NaT | 2018-02-06 |
| 99347 | a89abace0dcc01eeb267a9660b5ac126 | 2f0524a7b1b3845a1a57fcf3910c4333 | canceled | 2018-09-06 18:45:47 | NaT | NaT | NaT | 2018-09-27 |
| 99348 | a69ba794cc7deb415c3e15a0a3877e69 | 726f0894b5becdf952ea537d5266e543 | unavailable | 2017-08-23 16:28:04 | 2017-08-28 15:44:47 | NaT | NaT | 2017-09-15 |
| 99415 | 5fabc81b6322c8443648e1b21a6fef21 | 32c9df889d41b0ee8309a5efb6855dcb | unavailable | 2017-10-10 10:50:03 | 2017-10-14 18:35:57 | NaT | NaT | 2017-10-23 |
2957 rows × 8 columns
'''cоздадим дополнительную колонку с месяцем и годом планируемой даты доставки'''
orders_not_delivered['month_year'] = orders_not_delivered['order_estimated_delivery_date'].dt.to_period('M')
orders_not_delivered
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | month_year | |
|---|---|---|---|---|---|---|---|---|---|
| 6 | 136cce7faa42fdb2cefd53fdc79a6098 | ed0271e0b7da060a393796590e7b737a | invoiced | 2017-04-11 12:22:08 | 2017-04-13 13:25:17 | NaT | NaT | 2017-05-09 | 2017-05 |
| 44 | ee64d42b8cf066f35eac1cf57de1aa85 | caded193e8e47b8362864762a83db3c5 | shipped | 2018-06-04 16:44:48 | 2018-06-05 04:31:18 | 2018-06-05 14:32:00 | NaT | 2018-06-28 | 2018-06 |
| 103 | 0760a852e4e9d89eb77bf631eaaf1c84 | d2a79636084590b7465af8ab374a8cf5 | invoiced | 2018-08-03 17:44:42 | 2018-08-07 06:15:14 | NaT | NaT | 2018-08-21 | 2018-08 |
| 128 | 15bed8e2fec7fdbadb186b57c46c92f2 | f3f0e613e0bdb9c7cee75504f0f90679 | processing | 2017-09-03 14:22:03 | 2017-09-03 14:30:09 | NaT | NaT | 2017-10-03 | 2017-10 |
| 154 | 6942b8da583c2f9957e990d028607019 | 52006a9383bf149a4fb24226b173106f | shipped | 2018-01-10 11:33:07 | 2018-01-11 02:32:30 | 2018-01-11 19:39:23 | NaT | 2018-02-07 | 2018-02 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99283 | 3a3cddda5a7c27851bd96c3313412840 | 0b0d6095c5555fe083844281f6b093bb | canceled | 2018-08-31 16:13:44 | NaT | NaT | NaT | 2018-10-01 | 2018-10 |
| 99313 | e9e64a17afa9653aacf2616d94c005b8 | b4cd0522e632e481f8eaf766a2646e86 | processing | 2018-01-05 23:07:24 | 2018-01-09 07:18:05 | NaT | NaT | 2018-02-06 | 2018-02 |
| 99347 | a89abace0dcc01eeb267a9660b5ac126 | 2f0524a7b1b3845a1a57fcf3910c4333 | canceled | 2018-09-06 18:45:47 | NaT | NaT | NaT | 2018-09-27 | 2018-09 |
| 99348 | a69ba794cc7deb415c3e15a0a3877e69 | 726f0894b5becdf952ea537d5266e543 | unavailable | 2017-08-23 16:28:04 | 2017-08-28 15:44:47 | NaT | NaT | 2017-09-15 | 2017-09 |
| 99415 | 5fabc81b6322c8443648e1b21a6fef21 | 32c9df889d41b0ee8309a5efb6855dcb | unavailable | 2017-10-10 10:50:03 | 2017-10-14 18:35:57 | NaT | NaT | 2017-10-23 | 2017-10 |
2957 rows × 9 columns
'''сгруппируем датафрейм по месяцам и статусам заказов'''
orders_not_delivered = orders_not_delivered.groupby(['month_year', 'order_status'], as_index = False) \
.agg({'order_id': 'count'})
orders_not_delivered
| month_year | order_status | order_id | |
|---|---|---|---|
| 0 | 2016-09 | canceled | 1 |
| 1 | 2016-10 | canceled | 3 |
| 2 | 2016-10 | shipped | 1 |
| 3 | 2016-11 | canceled | 10 |
| 4 | 2016-11 | invoiced | 9 |
| ... | ... | ... | ... |
| 110 | 2018-08 | unavailable | 22 |
| 111 | 2018-09 | canceled | 49 |
| 112 | 2018-09 | shipped | 4 |
| 113 | 2018-10 | canceled | 12 |
| 114 | 2018-11 | canceled | 1 |
115 rows × 3 columns
'''сделаем сводную таблицу'''
orders_not_delivered_pivot = orders_not_delivered.pivot(index = 'month_year', columns = 'order_status', values = 'order_id').fillna(0)
'''в итоге имеем таблицу следующего вида. В каждом месяце каждого года наглядно можно увидеть количество недоставленных заказов
в разрезе их статусов'''
orders_not_delivered_pivot
| order_status | approved | canceled | created | invoiced | processing | shipped | unavailable |
|---|---|---|---|---|---|---|---|
| month_year | |||||||
| 2016-09 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2016-10 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2016-11 | 0.0 | 10.0 | 0.0 | 9.0 | 0.0 | 1.0 | 5.0 |
| 2016-12 | 0.0 | 7.0 | 0.0 | 9.0 | 2.0 | 7.0 | 2.0 |
| 2017-02 | 0.0 | 2.0 | 0.0 | 1.0 | 1.0 | 6.0 | 2.0 |
| 2017-03 | 1.0 | 26.0 | 0.0 | 21.0 | 38.0 | 41.0 | 66.0 |
| 2017-04 | 0.0 | 22.0 | 0.0 | 5.0 | 23.0 | 35.0 | 19.0 |
| 2017-05 | 1.0 | 23.0 | 0.0 | 19.0 | 12.0 | 60.0 | 16.0 |
| 2017-06 | 0.0 | 29.0 | 0.0 | 11.0 | 22.0 | 52.0 | 29.0 |
| 2017-07 | 0.0 | 21.0 | 0.0 | 10.0 | 13.0 | 51.0 | 34.0 |
| 2017-08 | 0.0 | 29.0 | 0.0 | 11.0 | 13.0 | 48.0 | 43.0 |
| 2017-09 | 0.0 | 26.0 | 0.0 | 21.0 | 17.0 | 49.0 | 35.0 |
| 2017-10 | 0.0 | 18.0 | 0.0 | 17.0 | 25.0 | 33.0 | 40.0 |
| 2017-11 | 0.0 | 35.0 | 0.0 | 13.0 | 17.0 | 37.0 | 72.0 |
| 2017-12 | 0.0 | 27.0 | 2.0 | 38.0 | 28.0 | 75.0 | 69.0 |
| 2018-01 | 0.0 | 7.0 | 2.0 | 10.0 | 28.0 | 51.0 | 44.0 |
| 2018-02 | 0.0 | 32.0 | 0.0 | 11.0 | 34.0 | 58.0 | 40.0 |
| 2018-03 | 0.0 | 87.0 | 1.0 | 11.0 | 6.0 | 83.0 | 34.0 |
| 2018-04 | 0.0 | 17.0 | 0.0 | 24.0 | 10.0 | 126.0 | 11.0 |
| 2018-05 | 0.0 | 26.0 | 0.0 | 12.0 | 6.0 | 112.0 | 12.0 |
| 2018-06 | 0.0 | 11.0 | 0.0 | 24.0 | 5.0 | 36.0 | 7.0 |
| 2018-07 | 0.0 | 26.0 | 0.0 | 3.0 | 0.0 | 51.0 | 7.0 |
| 2018-08 | 0.0 | 72.0 | 0.0 | 34.0 | 1.0 | 90.0 | 22.0 |
| 2018-09 | 0.0 | 49.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 |
| 2018-10 | 0.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2018-11 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
orders_not_delivered_pivot.mean().sum()
113.73076923076923
orders_not_delivered_pivot.mean()
order_status approved 0.076923 canceled 23.807692 created 0.192308 invoiced 12.076923 processing 11.576923 shipped 42.576923 unavailable 23.423077 dtype: float64
'''будем использовать первоначальный датафрейм с заказами'''
orders_df
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 |
| 99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 |
| 99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 |
| 99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 |
| 99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 |
99441 rows × 8 columns
'''напомню, что у нас есть заказы, которые мы посчитали багами, удалим их'''
bug_delivered = orders_df.fillna(0) \
.query("order_status == 'delivered' and (order_approved_at == 0 or order_delivered_carrier_date == 0 or order_delivered_customer_date == 0)") \
.order_id.to_list()
'''удаляем сомнительные 'доставленные' заказы из датафрейма'''
orders_task2_variant2 = orders_df[~orders_df['order_id'].isin(bug_delivered)]
'''посмотрим на статусы заказов. Оставляем только 'delivered', 'unavailable' и 'canceled. Остальные нас не интересуют.'''
orders_task2_variant2.order_status.unique()
array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
'canceled', 'created', 'approved'], dtype=object)
orders_task2_variant2 = orders_task2_variant2.query('order_status == "unavailable" or order_status == "canceled" or order_status == "delivered"')
'''создадим доп столбцы с месяцем даты доставки и обещанной даты доставки'''
orders_task2_variant2['order_delivered_customer_month'] = orders_task2_variant2['order_delivered_customer_date'].dt.to_period('M')
orders_task2_variant2['order_estimated_delivery_month'] = orders_task2_variant2['order_estimated_delivery_date'].dt.to_period('M')
'''теперь напишем функцию, которую будем использовать при создании столбца с причиной, по которой товар не доставлен вовремя.
А именно: если заказ отменен или недоступен, так и указываем. Если же заказ доставлен, но не в обещанном месяце, то смотрим
на даты. Либо его поздно передали на доставку, либо передали вовремя, а вот доставляли долго.'''
def reasons(df):
if pd.isna(df['order_delivered_customer_date']):
return df['order_status']
elif df['order_delivered_customer_month'] > df['order_estimated_delivery_month']:
if df['order_delivered_carrier_date'] > df['order_estimated_delivery_date']:
return 'поздно передан в доставку'
else:
return 'не доставлен к обещанной дате'
orders_task2_variant2 = orders_task2_variant2.assign(reason=orders_task2_variant2.apply(reasons, axis=1))
orders_task2_variant2
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delivered_customer_month | order_estimated_delivery_month | reason | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 2017-10 | 2017-10 | None |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | 2018-08 | 2018-08 | None |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 2018-08 | 2018-09 | None |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 2017-12 | 2017-12 | None |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 2018-02 | 2018-02 | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 | 2017-03 | 2017-03 | None |
| 99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 | 2018-02 | 2018-03 | None |
| 99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 | 2017-09 | 2017-09 | None |
| 99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 | 2018-01 | 2018-02 | None |
| 99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 | 2018-03 | 2018-04 | None |
97689 rows × 11 columns
'''теперь уберем из таблицы все, что не нужно, оставив только наши не вовремя доставленные/не доставленные заказы'''
orders_task2_variant2 = orders_task2_variant2[orders_task2_variant2['reason'].notna()]
orders_task2_variant2.reason.unique()
array(['поздно передан в доставку', 'не доставлен к обещанной дате',
'unavailable', 'canceled'], dtype=object)
'''сгруппируем датафрейм по месяцам и причинам'''
orders_task2_variant2 = orders_task2_variant2.groupby(['order_estimated_delivery_month', 'reason'], as_index = False) \
.agg({'order_id': 'count'})
orders_task2_variant2_pivot = orders_task2_variant2.pivot(index = 'order_estimated_delivery_month', columns = 'reason', values = 'order_id').fillna(0)
orders_task2_variant2_pivot
| reason | canceled | unavailable | не доставлен к обещанной дате | поздно передан в доставку |
|---|---|---|---|---|
| order_estimated_delivery_month | ||||
| 2016-09 | 1.0 | 0.0 | 0.0 | 0.0 |
| 2016-10 | 3.0 | 0.0 | 0.0 | 1.0 |
| 2016-11 | 10.0 | 5.0 | 0.0 | 1.0 |
| 2016-12 | 7.0 | 2.0 | 0.0 | 0.0 |
| 2017-02 | 2.0 | 2.0 | 9.0 | 1.0 |
| 2017-03 | 26.0 | 66.0 | 25.0 | 10.0 |
| 2017-04 | 22.0 | 19.0 | 38.0 | 0.0 |
| 2017-05 | 23.0 | 16.0 | 33.0 | 4.0 |
| 2017-06 | 29.0 | 29.0 | 19.0 | 4.0 |
| 2017-07 | 21.0 | 34.0 | 21.0 | 1.0 |
| 2017-08 | 29.0 | 43.0 | 37.0 | 0.0 |
| 2017-09 | 26.0 | 35.0 | 41.0 | 7.0 |
| 2017-10 | 18.0 | 40.0 | 39.0 | 4.0 |
| 2017-11 | 35.0 | 72.0 | 45.0 | 13.0 |
| 2017-12 | 27.0 | 69.0 | 292.0 | 30.0 |
| 2018-01 | 7.0 | 44.0 | 54.0 | 13.0 |
| 2018-02 | 32.0 | 40.0 | 105.0 | 10.0 |
| 2018-03 | 87.0 | 34.0 | 621.0 | 10.0 |
| 2018-04 | 17.0 | 11.0 | 203.0 | 2.0 |
| 2018-05 | 26.0 | 12.0 | 161.0 | 5.0 |
| 2018-06 | 11.0 | 7.0 | 14.0 | 2.0 |
| 2018-07 | 26.0 | 7.0 | 43.0 | 11.0 |
| 2018-08 | 72.0 | 22.0 | 40.0 | 11.0 |
| 2018-09 | 49.0 | 0.0 | 0.0 | 0.0 |
| 2018-10 | 12.0 | 0.0 | 0.0 | 0.0 |
| 2018-11 | 1.0 | 0.0 | 0.0 | 0.0 |
orders_task2_variant2_pivot.sum()
reason canceled 619.0 unavailable 609.0 не доставлен к обещанной дате 1840.0 поздно передан в доставку 140.0 dtype: float64
orders_task2_variant2_pivot.mean()
reason canceled 23.807692 unavailable 23.423077 не доставлен к обещанной дате 70.769231 поздно передан в доставку 5.384615 dtype: float64
orders_final
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99436 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 |
| 99437 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 |
| 99438 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 |
| 99439 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 |
| 99440 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 |
99272 rows × 8 columns
goods_df = pd.read_csv('olist_order_items_dataset.csv', parse_dates = ['shipping_limit_date'])
goods_df
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 112645 | fffc94f6ce00a00581880bf54a75a037 | 1 | 4aa6014eceb682077f9dc4bffebc05b0 | b8bc237ba3788b23da09c0f1f3a3288c | 2018-05-02 04:11:01 | 299.99 | 43.41 |
| 112646 | fffcd46ef2263f404302a634eb57f7eb | 1 | 32e07fd915822b0765e448c4dd74c828 | f3c38ab652836d21de61fb8314b69182 | 2018-07-20 04:31:48 | 350.00 | 36.53 |
| 112647 | fffce4705a9662cd70adb13d4a31832d | 1 | 72a30483855e2eafc67aee5dc2560482 | c3cfdc648177fdbbbb35635a37472c53 | 2017-10-30 17:14:25 | 99.90 | 16.95 |
| 112648 | fffe18544ffabc95dfada21779c9644f | 1 | 9c422a519119dcad7575db5af1ba540e | 2b3e4a2a3ea8e01938cabda2a3e5cc79 | 2017-08-21 00:04:32 | 55.99 | 8.72 |
| 112649 | fffe41c64501cc87c801fd61db3f6244 | 1 | 350688d9dc1e75ff97be326363655e01 | f7ccf836d21b2fb1de37564105216cc1 | 2018-06-12 17:10:13 | 43.00 | 12.79 |
112650 rows × 7 columns
'''пропусков нет, дубликатов нет, типы данных верные'''
goods_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112650 entries, 0 to 112649 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 112650 non-null object 1 order_item_id 112650 non-null int64 2 product_id 112650 non-null object 3 seller_id 112650 non-null object 4 shipping_limit_date 112650 non-null datetime64[ns] 5 price 112650 non-null float64 6 freight_value 112650 non-null float64 dtypes: datetime64[ns](1), float64(2), int64(1), object(3) memory usage: 6.0+ MB
goods_df.duplicated().sum()
0
goods_df.merge(orders_final, how = 'left', on = 'order_id')
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 | 3ce436f183e68e07877b285a838db11a | delivered | 2017-09-13 08:59:02 | 2017-09-13 09:45:35 | 2017-09-19 18:34:16 | 2017-09-20 23:43:48 | 2017-09-29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 | f6dd3ec061db4e3987629fe6b26e5cce | delivered | 2017-04-26 10:53:06 | 2017-04-26 11:05:13 | 2017-05-04 14:35:00 | 2017-05-12 16:04:24 | 2017-05-15 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 | 6489ae5e4333f3693df5ad4372dab6d3 | delivered | 2018-01-14 14:33:31 | 2018-01-14 14:48:30 | 2018-01-16 12:36:48 | 2018-01-22 13:19:16 | 2018-02-05 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 | d4eb9395c8c0431ee92fce09860c5a06 | delivered | 2018-08-08 10:00:35 | 2018-08-08 10:10:18 | 2018-08-10 13:28:00 | 2018-08-14 13:32:39 | 2018-08-20 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 | 58dbd0b2d70206bf40e62cd34e84d795 | delivered | 2017-02-04 13:57:51 | 2017-02-04 14:10:13 | 2017-02-16 09:46:09 | 2017-03-01 16:42:31 | 2017-03-17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112645 | fffc94f6ce00a00581880bf54a75a037 | 1 | 4aa6014eceb682077f9dc4bffebc05b0 | b8bc237ba3788b23da09c0f1f3a3288c | 2018-05-02 04:11:01 | 299.99 | 43.41 | b51593916b4b8e0d6f66f2ae24f2673d | delivered | 2018-04-23 13:57:06 | 2018-04-25 04:11:01 | 2018-04-25 12:09:00 | 2018-05-10 22:56:40 | 2018-05-18 |
| 112646 | fffcd46ef2263f404302a634eb57f7eb | 1 | 32e07fd915822b0765e448c4dd74c828 | f3c38ab652836d21de61fb8314b69182 | 2018-07-20 04:31:48 | 350.00 | 36.53 | 84c5d4fbaf120aae381fad077416eaa0 | delivered | 2018-07-14 10:26:46 | 2018-07-17 04:31:48 | 2018-07-17 08:05:00 | 2018-07-23 20:31:55 | 2018-08-01 |
| 112647 | fffce4705a9662cd70adb13d4a31832d | 1 | 72a30483855e2eafc67aee5dc2560482 | c3cfdc648177fdbbbb35635a37472c53 | 2017-10-30 17:14:25 | 99.90 | 16.95 | 29309aa813182aaddc9b259e31b870e6 | delivered | 2017-10-23 17:07:56 | 2017-10-24 17:14:25 | 2017-10-26 15:13:14 | 2017-10-28 12:22:22 | 2017-11-10 |
| 112648 | fffe18544ffabc95dfada21779c9644f | 1 | 9c422a519119dcad7575db5af1ba540e | 2b3e4a2a3ea8e01938cabda2a3e5cc79 | 2017-08-21 00:04:32 | 55.99 | 8.72 | b5e6afd5a41800fdf401e0272ca74655 | delivered | 2017-08-14 23:02:59 | 2017-08-15 00:04:32 | 2017-08-15 19:02:53 | 2017-08-16 21:59:40 | 2017-08-25 |
| 112649 | fffe41c64501cc87c801fd61db3f6244 | 1 | 350688d9dc1e75ff97be326363655e01 | f7ccf836d21b2fb1de37564105216cc1 | 2018-06-12 17:10:13 | 43.00 | 12.79 | 96d649da0cc4ff33bb408b199d4c7dcf | delivered | 2018-06-09 17:00:18 | 2018-06-09 17:10:13 | 2018-06-11 14:11:00 | 2018-06-14 17:56:26 | 2018-06-28 |
112650 rows × 14 columns
'''после всех манипуляций в итоговой таблице у нас должно остаться 112626 строк'''
goods_with_dates = goods_df.merge(orders_final, how = 'left', on = 'order_id')
goods_with_dates.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 112650 entries, 0 to 112649 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 112650 non-null object 1 order_item_id 112650 non-null int64 2 product_id 112650 non-null object 3 seller_id 112650 non-null object 4 shipping_limit_date 112650 non-null datetime64[ns] 5 price 112650 non-null float64 6 freight_value 112650 non-null float64 7 customer_id 112626 non-null object 8 order_status 112626 non-null object 9 order_purchase_timestamp 112626 non-null datetime64[ns] 10 order_approved_at 112626 non-null datetime64[ns] 11 order_delivered_carrier_date 111434 non-null datetime64[ns] 12 order_delivered_customer_date 110180 non-null datetime64[ns] 13 order_estimated_delivery_date 112626 non-null datetime64[ns] dtypes: datetime64[ns](6), float64(2), int64(1), object(5) memory usage: 12.9+ MB
goods_with_dates = goods_with_dates.dropna(subset=['order_purchase_timestamp'], how='all')
goods_with_dates
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 | 3ce436f183e68e07877b285a838db11a | delivered | 2017-09-13 08:59:02 | 2017-09-13 09:45:35 | 2017-09-19 18:34:16 | 2017-09-20 23:43:48 | 2017-09-29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 | f6dd3ec061db4e3987629fe6b26e5cce | delivered | 2017-04-26 10:53:06 | 2017-04-26 11:05:13 | 2017-05-04 14:35:00 | 2017-05-12 16:04:24 | 2017-05-15 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 | 6489ae5e4333f3693df5ad4372dab6d3 | delivered | 2018-01-14 14:33:31 | 2018-01-14 14:48:30 | 2018-01-16 12:36:48 | 2018-01-22 13:19:16 | 2018-02-05 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 | d4eb9395c8c0431ee92fce09860c5a06 | delivered | 2018-08-08 10:00:35 | 2018-08-08 10:10:18 | 2018-08-10 13:28:00 | 2018-08-14 13:32:39 | 2018-08-20 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 | 58dbd0b2d70206bf40e62cd34e84d795 | delivered | 2017-02-04 13:57:51 | 2017-02-04 14:10:13 | 2017-02-16 09:46:09 | 2017-03-01 16:42:31 | 2017-03-17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112645 | fffc94f6ce00a00581880bf54a75a037 | 1 | 4aa6014eceb682077f9dc4bffebc05b0 | b8bc237ba3788b23da09c0f1f3a3288c | 2018-05-02 04:11:01 | 299.99 | 43.41 | b51593916b4b8e0d6f66f2ae24f2673d | delivered | 2018-04-23 13:57:06 | 2018-04-25 04:11:01 | 2018-04-25 12:09:00 | 2018-05-10 22:56:40 | 2018-05-18 |
| 112646 | fffcd46ef2263f404302a634eb57f7eb | 1 | 32e07fd915822b0765e448c4dd74c828 | f3c38ab652836d21de61fb8314b69182 | 2018-07-20 04:31:48 | 350.00 | 36.53 | 84c5d4fbaf120aae381fad077416eaa0 | delivered | 2018-07-14 10:26:46 | 2018-07-17 04:31:48 | 2018-07-17 08:05:00 | 2018-07-23 20:31:55 | 2018-08-01 |
| 112647 | fffce4705a9662cd70adb13d4a31832d | 1 | 72a30483855e2eafc67aee5dc2560482 | c3cfdc648177fdbbbb35635a37472c53 | 2017-10-30 17:14:25 | 99.90 | 16.95 | 29309aa813182aaddc9b259e31b870e6 | delivered | 2017-10-23 17:07:56 | 2017-10-24 17:14:25 | 2017-10-26 15:13:14 | 2017-10-28 12:22:22 | 2017-11-10 |
| 112648 | fffe18544ffabc95dfada21779c9644f | 1 | 9c422a519119dcad7575db5af1ba540e | 2b3e4a2a3ea8e01938cabda2a3e5cc79 | 2017-08-21 00:04:32 | 55.99 | 8.72 | b5e6afd5a41800fdf401e0272ca74655 | delivered | 2017-08-14 23:02:59 | 2017-08-15 00:04:32 | 2017-08-15 19:02:53 | 2017-08-16 21:59:40 | 2017-08-25 |
| 112649 | fffe41c64501cc87c801fd61db3f6244 | 1 | 350688d9dc1e75ff97be326363655e01 | f7ccf836d21b2fb1de37564105216cc1 | 2018-06-12 17:10:13 | 43.00 | 12.79 | 96d649da0cc4ff33bb408b199d4c7dcf | delivered | 2018-06-09 17:00:18 | 2018-06-09 17:10:13 | 2018-06-11 14:11:00 | 2018-06-14 17:56:26 | 2018-06-28 |
112626 rows × 14 columns
'''создадим колонку с названием дня недели, в который был заказан товар'''
goods_with_dates['day_of_week'] = goods_with_dates.order_purchase_timestamp.dt.day_name()
goods_with_dates
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | day_of_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 | 3ce436f183e68e07877b285a838db11a | delivered | 2017-09-13 08:59:02 | 2017-09-13 09:45:35 | 2017-09-19 18:34:16 | 2017-09-20 23:43:48 | 2017-09-29 | Wednesday |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 | f6dd3ec061db4e3987629fe6b26e5cce | delivered | 2017-04-26 10:53:06 | 2017-04-26 11:05:13 | 2017-05-04 14:35:00 | 2017-05-12 16:04:24 | 2017-05-15 | Wednesday |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 | 6489ae5e4333f3693df5ad4372dab6d3 | delivered | 2018-01-14 14:33:31 | 2018-01-14 14:48:30 | 2018-01-16 12:36:48 | 2018-01-22 13:19:16 | 2018-02-05 | Sunday |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 | d4eb9395c8c0431ee92fce09860c5a06 | delivered | 2018-08-08 10:00:35 | 2018-08-08 10:10:18 | 2018-08-10 13:28:00 | 2018-08-14 13:32:39 | 2018-08-20 | Wednesday |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 | 58dbd0b2d70206bf40e62cd34e84d795 | delivered | 2017-02-04 13:57:51 | 2017-02-04 14:10:13 | 2017-02-16 09:46:09 | 2017-03-01 16:42:31 | 2017-03-17 | Saturday |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 112645 | fffc94f6ce00a00581880bf54a75a037 | 1 | 4aa6014eceb682077f9dc4bffebc05b0 | b8bc237ba3788b23da09c0f1f3a3288c | 2018-05-02 04:11:01 | 299.99 | 43.41 | b51593916b4b8e0d6f66f2ae24f2673d | delivered | 2018-04-23 13:57:06 | 2018-04-25 04:11:01 | 2018-04-25 12:09:00 | 2018-05-10 22:56:40 | 2018-05-18 | Monday |
| 112646 | fffcd46ef2263f404302a634eb57f7eb | 1 | 32e07fd915822b0765e448c4dd74c828 | f3c38ab652836d21de61fb8314b69182 | 2018-07-20 04:31:48 | 350.00 | 36.53 | 84c5d4fbaf120aae381fad077416eaa0 | delivered | 2018-07-14 10:26:46 | 2018-07-17 04:31:48 | 2018-07-17 08:05:00 | 2018-07-23 20:31:55 | 2018-08-01 | Saturday |
| 112647 | fffce4705a9662cd70adb13d4a31832d | 1 | 72a30483855e2eafc67aee5dc2560482 | c3cfdc648177fdbbbb35635a37472c53 | 2017-10-30 17:14:25 | 99.90 | 16.95 | 29309aa813182aaddc9b259e31b870e6 | delivered | 2017-10-23 17:07:56 | 2017-10-24 17:14:25 | 2017-10-26 15:13:14 | 2017-10-28 12:22:22 | 2017-11-10 | Monday |
| 112648 | fffe18544ffabc95dfada21779c9644f | 1 | 9c422a519119dcad7575db5af1ba540e | 2b3e4a2a3ea8e01938cabda2a3e5cc79 | 2017-08-21 00:04:32 | 55.99 | 8.72 | b5e6afd5a41800fdf401e0272ca74655 | delivered | 2017-08-14 23:02:59 | 2017-08-15 00:04:32 | 2017-08-15 19:02:53 | 2017-08-16 21:59:40 | 2017-08-25 | Monday |
| 112649 | fffe41c64501cc87c801fd61db3f6244 | 1 | 350688d9dc1e75ff97be326363655e01 | f7ccf836d21b2fb1de37564105216cc1 | 2018-06-12 17:10:13 | 43.00 | 12.79 | 96d649da0cc4ff33bb408b199d4c7dcf | delivered | 2018-06-09 17:00:18 | 2018-06-09 17:10:13 | 2018-06-11 14:11:00 | 2018-06-14 17:56:26 | 2018-06-28 | Saturday |
112626 rows × 15 columns
'''сгруппируем датафрейм по id товара и дню недели, а затем посчитаем количество заказов каждого товара в каждый день недели.
Затем отсортируемся внутри каждого product_id по убыванию числа покупок. Сначала я планировала взять здесь первую строку
каждой группы и таким образом получить максимум. Но тогда получится, что если товар покупали одинаковое количество раз
в разные дни, то попадет только один день, что не совсем корректно. Поэтому пойдем другим путем.'''
products_with_days = goods_with_dates.groupby(['product_id', 'day_of_week'], as_index = False) \
.agg({'order_id':'count'}) \
.sort_values(['product_id','order_id'], ascending = [False,False]) \
.rename(columns = {'order_id':'number_of_purchases'})
products_with_days
| product_id | day_of_week | number_of_purchases | |
|---|---|---|---|
| 61215 | fffe9eeff12fcbd74a2f2b007dde0c58 | Wednesday | 1 |
| 61214 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Tuesday | 2 |
| 61211 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Friday | 1 |
| 61212 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Sunday | 1 |
| 61213 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Thursday | 1 |
| ... | ... | ... | ... |
| 3 | 000b8f95fcb9e0096488278317764d19 | Friday | 1 |
| 4 | 000b8f95fcb9e0096488278317764d19 | Wednesday | 1 |
| 2 | 0009406fd7479715e4bef61dd91f2462 | Thursday | 1 |
| 1 | 00088930e925c41fd95ebfe695fd2655 | Tuesday | 1 |
| 0 | 00066f42aeeb9f3007548bb9d3f33c38 | Sunday | 1 |
61216 rows × 3 columns
'''далее создадим дополнительную колонку с максимальным числом заказов для каждого товара'''
products_with_days['max_purchases'] = products_with_days.groupby('product_id')['number_of_purchases'].transform(max)
products_with_days
| product_id | day_of_week | number_of_purchases | max_purchases | |
|---|---|---|---|---|
| 61215 | fffe9eeff12fcbd74a2f2b007dde0c58 | Wednesday | 1 | 1 |
| 61214 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Tuesday | 2 | 2 |
| 61211 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Friday | 1 | 2 |
| 61212 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Sunday | 1 | 2 |
| 61213 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Thursday | 1 | 2 |
| ... | ... | ... | ... | ... |
| 3 | 000b8f95fcb9e0096488278317764d19 | Friday | 1 | 1 |
| 4 | 000b8f95fcb9e0096488278317764d19 | Wednesday | 1 | 1 |
| 2 | 0009406fd7479715e4bef61dd91f2462 | Thursday | 1 | 1 |
| 1 | 00088930e925c41fd95ebfe695fd2655 | Tuesday | 1 | 1 |
| 0 | 00066f42aeeb9f3007548bb9d3f33c38 | Sunday | 1 | 1 |
61216 rows × 4 columns
'''теперь отберем строки, в котрых число покупок соответствует максимуму. Если их несколько, создадим список из подходящих
дней недели'''
comparing = np.where(products_with_days["number_of_purchases"] == products_with_days["max_purchases"], True, False)
products_day_max = products_with_days[comparing]
products_day_max
| product_id | day_of_week | number_of_purchases | max_purchases | |
|---|---|---|---|---|
| 61215 | fffe9eeff12fcbd74a2f2b007dde0c58 | Wednesday | 1 | 1 |
| 61214 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Tuesday | 2 | 2 |
| 61210 | fff9553ac224cec9d15d49f5a263411f | Friday | 1 | 1 |
| 61209 | fff81cc3158d2725c0655ab9ba0f712c | Monday | 1 | 1 |
| 61207 | fff6177642830a9a94a0f2cba5e476d1 | Saturday | 1 | 1 |
| ... | ... | ... | ... | ... |
| 3 | 000b8f95fcb9e0096488278317764d19 | Friday | 1 | 1 |
| 4 | 000b8f95fcb9e0096488278317764d19 | Wednesday | 1 | 1 |
| 2 | 0009406fd7479715e4bef61dd91f2462 | Thursday | 1 | 1 |
| 1 | 00088930e925c41fd95ebfe695fd2655 | Tuesday | 1 | 1 |
| 0 | 00066f42aeeb9f3007548bb9d3f33c38 | Sunday | 1 | 1 |
42697 rows × 4 columns
products_day_max = products_day_max.groupby('product_id').day_of_week.apply(list).to_frame().reset_index()
products_day_max
| product_id | day_of_week | |
|---|---|---|
| 0 | 00066f42aeeb9f3007548bb9d3f33c38 | [Sunday] |
| 1 | 00088930e925c41fd95ebfe695fd2655 | [Tuesday] |
| 2 | 0009406fd7479715e4bef61dd91f2462 | [Thursday] |
| 3 | 000b8f95fcb9e0096488278317764d19 | [Friday, Wednesday] |
| 4 | 000d9be29b5207b54e86aa1b1ac54872 | [Tuesday] |
| ... | ... | ... |
| 32940 | fff6177642830a9a94a0f2cba5e476d1 | [Saturday, Sunday] |
| 32941 | fff81cc3158d2725c0655ab9ba0f712c | [Monday] |
| 32942 | fff9553ac224cec9d15d49f5a263411f | [Friday] |
| 32943 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | [Tuesday] |
| 32944 | fffe9eeff12fcbd74a2f2b007dde0c58 | [Wednesday] |
32945 rows × 2 columns
'''ну и просто уберем списки, чтобы было крсивее'''
products_day_max['day_of_week'] = products_day_max.day_of_week.apply(lambda x: ', '.join(x))
products_day_max
| product_id | day_of_week | |
|---|---|---|
| 0 | 00066f42aeeb9f3007548bb9d3f33c38 | Sunday |
| 1 | 00088930e925c41fd95ebfe695fd2655 | Tuesday |
| 2 | 0009406fd7479715e4bef61dd91f2462 | Thursday |
| 3 | 000b8f95fcb9e0096488278317764d19 | Friday, Wednesday |
| 4 | 000d9be29b5207b54e86aa1b1ac54872 | Tuesday |
| ... | ... | ... |
| 32940 | fff6177642830a9a94a0f2cba5e476d1 | Saturday, Sunday |
| 32941 | fff81cc3158d2725c0655ab9ba0f712c | Monday |
| 32942 | fff9553ac224cec9d15d49f5a263411f | Friday |
| 32943 | fffdb2d0ec8d6a61f0a0a0db3f25b441 | Tuesday |
| 32944 | fffe9eeff12fcbd74a2f2b007dde0c58 | Wednesday |
32945 rows × 2 columns
orders_customers_final
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99267 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 | 6359f309b166b0196dbf7ad2ac62bb5a | 12209 | sao jose dos campos | SP |
| 99268 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 | da62f9e57a76d978d02ab5362c509660 | 11722 | praia grande | SP |
| 99269 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 | 737520a9aad80b3fbbdad19b66b37b30 | 45920 | nova vicosa | BA |
| 99270 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 | 5097a5312c8b157bb7be58ae360ef43c | 28685 | japuiba | RJ |
| 99271 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 | 60350aa974b26ff12caad89e55993bd6 | 83750 | lapa | PR |
99272 rows × 12 columns
'''создадим колонку с месяцем и годом, чтобы потом по ним сгруппировать дф и посчитать число покупок в каждом месяце'''
orders_customers_final['month_year'] = orders_customers_final['order_purchase_timestamp'].dt.to_period('M')
'''теперь создадим дф, в котором для каждого уникального клиента будет видно число покупок в каждом месяце'''
grouped_orders = orders_customers_final.groupby(['customer_unique_id','month_year'], as_index = False) \
.agg({'order_id':'count'}) \
.sort_values(['customer_unique_id','order_id'], ascending = [False,False])
grouped_orders
| customer_unique_id | month_year | order_id | |
|---|---|---|---|
| 97906 | ffffd2657e2aad2907e67c3e9daecbeb | 2017-05 | 1 |
| 97905 | ffff5962728ec6157033ef9805bacc48 | 2018-05 | 1 |
| 97904 | ffff371b4d645b6ecea244b27531430a | 2017-02 | 1 |
| 97903 | fffea47cd6d3cc0a88bd621562a9d061 | 2017-12 | 1 |
| 97902 | fffcf5a5ff07b0908bd4e2dbc735a684 | 2017-06 | 1 |
| ... | ... | ... | ... |
| 4 | 0004aac84e0df4da2b147fca70cf8255 | 2017-11 | 1 |
| 3 | 0000f6ccb0745a6a4b88665a16c9f078 | 2017-10 | 1 |
| 2 | 0000f46a3911fa3c0805444483337064 | 2017-03 | 1 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 2018-05 | 1 |
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 2018-05 | 1 |
97907 rows × 3 columns
'''создадим колонку с числом дней и недель в каждом месяце'''
grouped_orders['days_in_month'] = grouped_orders.month_year.dt.daysinmonth
grouped_orders['weeks_in_month'] = grouped_orders['days_in_month'] / 7
grouped_orders = grouped_orders.rename(columns = {'month_year': 'date', 'order_id': 'purchases'})
'''создадим колонку со средним числом покупок в неделю по месяцам. Для этого число покупок в месяц для каждого покупателя
разделим на число недель в месяце'''
grouped_orders['average_number_of_purchases'] = grouped_orders['purchases'] / grouped_orders['weeks_in_month']
grouped_orders
| customer_unique_id | date | purchases | days_in_month | weeks_in_month | average_number_of_purchases | |
|---|---|---|---|---|---|---|
| 97906 | ffffd2657e2aad2907e67c3e9daecbeb | 2017-05 | 1 | 31 | 4.428571 | 0.225806 |
| 97905 | ffff5962728ec6157033ef9805bacc48 | 2018-05 | 1 | 31 | 4.428571 | 0.225806 |
| 97904 | ffff371b4d645b6ecea244b27531430a | 2017-02 | 1 | 28 | 4.000000 | 0.250000 |
| 97903 | fffea47cd6d3cc0a88bd621562a9d061 | 2017-12 | 1 | 31 | 4.428571 | 0.225806 |
| 97902 | fffcf5a5ff07b0908bd4e2dbc735a684 | 2017-06 | 1 | 30 | 4.285714 | 0.233333 |
| ... | ... | ... | ... | ... | ... | ... |
| 4 | 0004aac84e0df4da2b147fca70cf8255 | 2017-11 | 1 | 30 | 4.285714 | 0.233333 |
| 3 | 0000f6ccb0745a6a4b88665a16c9f078 | 2017-10 | 1 | 31 | 4.428571 | 0.225806 |
| 2 | 0000f46a3911fa3c0805444483337064 | 2017-03 | 1 | 31 | 4.428571 | 0.225806 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 2018-05 | 1 | 31 | 4.428571 | 0.225806 |
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 2018-05 | 1 | 31 | 4.428571 | 0.225806 |
97907 rows × 6 columns
'''оставим только нужные колонки'''
grouped_orders_final = grouped_orders[['customer_unique_id', 'date', 'average_number_of_purchases']]
'''вот так выглядит финальный датафрейм. В нем мы оставили только необходимую информацию, а именно уникальный id пользователя,
месяц и среднее число покупок в неделю в месяце'''
grouped_orders_final.sort_values('average_number_of_purchases', ascending = False)
| customer_unique_id | date | average_number_of_purchases | |
|---|---|---|---|
| 7315 | 12f5d6e1cbf93dafd9dcc19095df0b3d | 2017-01 | 1.354839 |
| 69298 | b4e4f24de1e8725b74e4a1f4975116ed | 2018-02 | 1.000000 |
| 23906 | 3e43e6105506432c953e165fb2acf44c | 2018-02 | 1.000000 |
| 62111 | a239b8e2fbce33780f1f1912e2ee5275 | 2017-02 | 1.000000 |
| 14468 | 25a560b9a6006157838aab1bdbd68624 | 2017-04 | 0.933333 |
| ... | ... | ... | ... |
| 57156 | 958a09f1a17bfaa6ca721b5aa01f59f6 | 2018-05 | 0.225806 |
| 57153 | 958843e128a10a8ae35155b6fc57fec3 | 2018-03 | 0.225806 |
| 57152 | 9585898e80f9de26178eb04cd600b6f5 | 2017-08 | 0.225806 |
| 57150 | 95834cf255d8c39ea415b7e35272ca13 | 2017-12 | 0.225806 |
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 2018-05 | 0.225806 |
97907 rows × 3 columns
'''при необходимости можно отфильтровать дф по id пользователя и посмотреть необходимую информацию'''
grouped_orders_final.query('customer_unique_id == "8d50f5eadf50201ccdcedfb9e2ac8455"')
| customer_unique_id | date | average_number_of_purchases | |
|---|---|---|---|
| 53960 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2017-07 | 0.677419 |
| 53967 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2018-08 | 0.677419 |
| 53963 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2017-10 | 0.451613 |
| 53965 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2018-05 | 0.451613 |
| 53966 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2018-07 | 0.451613 |
| 53958 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2017-05 | 0.225806 |
| 53959 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2017-06 | 0.233333 |
| 53961 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2017-08 | 0.225806 |
| 53962 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2017-09 | 0.233333 |
| 53964 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 2017-11 | 0.233333 |
orders_customers_final
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | month_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 2017-10 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 2018-07 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO | 2018-08 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN | 2017-11 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP | 2018-02 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 99267 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 | 6359f309b166b0196dbf7ad2ac62bb5a | 12209 | sao jose dos campos | SP | 2017-03 |
| 99268 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 | da62f9e57a76d978d02ab5362c509660 | 11722 | praia grande | SP | 2018-02 |
| 99269 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 | 737520a9aad80b3fbbdad19b66b37b30 | 45920 | nova vicosa | BA | 2017-08 |
| 99270 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 | 5097a5312c8b157bb7be58ae360ef43c | 28685 | japuiba | RJ | 2018-01 |
| 99271 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 | 60350aa974b26ff12caad89e55993bd6 | 83750 | lapa | PR | 2018-03 |
99272 rows × 13 columns
'''оставим только нужные для разделения пользователей на когорты колонки датафрейма'''
orders_for_cohorts = orders_customers_final[['order_purchase_timestamp', 'customer_unique_id', 'order_id', 'month_year']]
'''оставим только те заказы, которые входят в интересующий нас временной интервал'''
orders_for_cohorts = orders_for_cohorts.query('month_year >= "2017-01" and month_year <= "2018-03"')
orders_for_cohorts
| order_purchase_timestamp | customer_unique_id | order_id | month_year | |
|---|---|---|---|---|
| 0 | 2017-10-02 10:56:33 | 7c396fd4830fd04220f754e42b4e5bff | e481f51cbdc54678b7cc49136f2d6af7 | 2017-10 |
| 3 | 2017-11-18 19:28:06 | 7c142cf63193a1473d2e66489a9ae977 | 949d5b44dbf5de918fe9c16f97b45f8a | 2017-11 |
| 4 | 2018-02-13 21:18:39 | 72632f0f9dd73dfee390c9b22eb56dd6 | ad21c59c0840e6cb83a9ceb5573f8159 | 2018-02 |
| 5 | 2017-07-09 21:57:05 | 80bb27c7c16e8f973207a5086ab329e2 | a4591c265e18cb1dcee52889e2d8acc3 | 2017-07 |
| 6 | 2017-04-11 12:22:08 | 36edbb3fb164b1f16485364b6fb04c73 | 136cce7faa42fdb2cefd53fdc79a6098 | 2017-04 |
| ... | ... | ... | ... | ... |
| 99267 | 2017-03-09 09:54:05 | 6359f309b166b0196dbf7ad2ac62bb5a | 9c5dedf39a927c1b2549525ed64a053c | 2017-03 |
| 99268 | 2018-02-06 12:58:58 | da62f9e57a76d978d02ab5362c509660 | 63943bddc261676b46f01ca7ac2f7bd8 | 2018-02 |
| 99269 | 2017-08-27 14:46:43 | 737520a9aad80b3fbbdad19b66b37b30 | 83c1379a015df1e13d02aae0204711ab | 2017-08 |
| 99270 | 2018-01-08 21:28:27 | 5097a5312c8b157bb7be58ae360ef43c | 11c177c8e97725db2631073c19f07b62 | 2018-01 |
| 99271 | 2018-03-08 20:57:30 | 60350aa974b26ff12caad89e55993bd6 | 66dea50a8b16d9b4dee7af250b4be1a5 | 2018-03 |
66226 rows × 4 columns
'''теперь уберем из датафрейма тех пользователей, для которых покупка в указанный период не первая. Для этого создадим
датафрейм с покупками за 2016 год, добавим id пользователей из него в список и по указанному списку отфильтруем наш
датафрейм для формирования когорт'''
orders_2016 = orders_customers_final.query('month_year < "2017-01"').customer_unique_id.to_list()
orders_2016
['07d5b1f9bdd053347f0213187b343ac9', '12cc890391e57d7123e004856ff1f137', '664f7107c351f24b16b68919398730a0', '1dc848c11c2985635c268ff3dea707eb', 'fb48b86d9a78b53675c121e4ec8981ed', 'bdb88619ffe5ca24fa5732468bd34573', '8130a5b332a233457aad5d824b9df2cb', '451c36d15ca9729da50f130f5c0f1c59', '4854e9b3feff728c13ee5fc7d1547e92', 'a6f05f77ad475fa43a454923be0c97e8', '2760aef7bd977a44aabf53f656e92722', 'b7d76e111c89f7ebf14761390f0f7d17', '94e5ea5a8c1bf546db2739673060c43f', 'f1efc19f85b4823a0bfde8f77f913207', 'a93b7ffb96986b7ae5bef50b9f789ed8', '28706b8dc0fe2a95fbcec1e53253d8e4', '7a2679d6c164f6b6a2f36daf1b2fbc9d', 'a5cad0a33504fb5b9725ac33ed686359', '4766fb51c287a3a3a67eae04ed0e75a4', 'ad1cb06f09b27e4773870d1f76a68557', '2552194579775fed6641fa5f56152e0a', '87776adb449c551e74c13fc34f036105', 'b10f780a5e22a8c7fbce07bb3da6eeff', '046ed33aa8bef51855af6010a11a835c', 'cb9f4b2fe91fdef805d9d0344b471246', 'b74deaa44a000da4ce9bf2913cb8e232', '18af4524eed79155034a9374880c5078', '8329519e31cb1b89bd44c3c6ae417ad6', '3b32327f677fba6212c2e8e13792da74', '6e4c8eed6842146f77d917f7b437b4c2', 'e8d4692a6aec2ab156ad409cd1a72bb7', '2f96790fba243100730c8d1185dd2f25', '72602f19e9db5d1f5245849d55794e52', '009b0127b727ab0ba422f6d9604487c7', 'ba1a0b654ccab39f3691fbc1b35c2499', '4910006053f92928b62910d568d4ad1e', 'b3724b93edd00e472e1db7017236f60a', '9783afad41afa667522bf03656c516f3', '71b35d5b0fa5a6a266dd6f103680009c', 'ee1b69039bd20877692d8622dedbc3ec', '4403513209514871ac9d0fd681f7cdb5', 'b2272a60af3b7f5afe04c6cbf1dcfb15', 'eeb1b66fee35b3f1e3dfc27ff80fc0a2', '5da3ad487547da6f4f3011269b38f324', '854e546b53f9b86fecc634f1cc7587cb', '6d0572dd5acc888fae43a2e5789bb990', 'e08b2dabdfac348704d3b031dfbf6d5f', 'be8375783b7439dd7ef30b89fb46188b', '8f800fc6d16456d0e952825096c0f40e', 'de51ac050e580a6f05fdf0d56fac2cef', '3f4f614c632af7fc7508462a7cb55ac2', '0d527a050dc34913c5c4746cefae5995', '5f61b10d70849d46be5643b7453e1708', '7bf845014243c08bdcea071d6981303e', '55d3b6aaa9c85f456862f7e2e270102b', 'f571289f61836e3356b8c679b0652f3a', '99e41e69b60bd63edd83935499523063', '3c34090736785a45dd8cbf536ad289c4', '8cd892a381c7d2f5532e23b83e509399', '2e629c69eaa172cb271dcf7f890f921a', 'd58063e0329c70d19363e6225ca43128', 'acf5cef98a4798f470a2bb70a330ac07', '381411bd0b3e1aff0b23bfed26d000af', 'bb5f2d03e1e945a1e5c1ab23c3de4c64', 'ac10064713bd3169f17aed85acad2106', '0032c76b20340da25249092a268ce66c', '22e16e7154401d6eb4f7ba788b3edb7e', '231fb63f9dc7482c7016020ef322b011', '6360ff9a3d58800ba0b03b1c6c290d92', '4dab471599dbcca61ac7f57baf6537c5', '1979dc402581a74e2bd0c0853951311d', 'ff39ae410be7f9b3a2457c546c6f67de', 'd51c96d2f1d710b94744d6f7b88387d7', 'f15a952dfc52308d0361288fbf42c7b3', '0acb54b64345b37d81205175f3f3e779', 'd7afcd4b6f0d645a8efa24bef0b66ca0', 'e6a1d274cbba12da4cd29eb462b34be5', 'b313b4a68d70a37718f0313f32789e2b', 'd6a930873ee3f80aa823cb1d4baca58d', '0829f7df6577d5a4b65439bea701405f', '5c418344883a1f8617cb5577648adb4a', 'e8798977c2a62ecf8f2b17fba406097f', '9342b18c97a2bbf3fa9d97aceb0b6dd4', '95c47b016360d9186ca9138abb0e080d', '7af2db50b8a1a1c7fdf77acb033ba6b0', '2e92a624a8a3f8ab5eb51d42fca60e5b', '3b6d8779a13d0e022d93c46e814de0fa', 'cfbb7c1c3d613d974eef515b180e2c9b', '7a176e5d88c7acb6000a055e008c055d', 'ef3508c4c50a1f7c885ea2ee92218f08', '4962136f755981b83b03f476e66eea7d', '2f64e403852e6893ae37485d5fcacdaf', 'ecf55c92c7bf73cc1d576a0f6a235b0b', '76cfb52cb2b6b08a3a01454ad83e73f1', '95e70fa5b9014a6a718b63350260356c', 'ff822e5e1322cef6c211ad8a0591999e', 'd98096b0c597ce4bf28a2839ad0f89fc', '06bdfbbe1857c3c925ec81abfb1c9666', '830d5b7aaa3b6f1e9ad63703bec97d23', '225e63e9d240db566203b1ed918a809c', '3ed4404759ed8860749e78c75e683180', 'f7b62c75467e8ce080b201667cbbc274', '7fab7e8b8aa68f7a538ca818b7b57322', 'd8e40a4b16990a6fd1095feae46aa1af', 'e75b9df782b21f27d61449864774108b', '8d36e7e56a9f8044df728e9cbf5a3eaf', 'e3299196e1482e1ae1320fcb594bc23e', 'aa6ea6f8307ee33b985192329cd8c79c', '33fbcb38d3009eddfe53b3063c642959', '45d9410bc1f3c6e36dea41d8f823ada6', '37d1cea760ff9b96298e678a10b02c85', 'b92a2e5e8a6eabcc80882c7d68b2c70b', '10e89fd8e5c745f81bec101207ba4d7d', '87b3f231705783eb2217e25851c0a45d', 'cdf2b280dbabe2b047ba03f1886265e6', '2557b9d6b7d5ca77ebc116062e558214', 'dc86b29986d1f8ad458040d2400bf941', '9f302d00dd3e18ed3745778184b4f0fe', 'd60a512747263c5307fef65a58ff7532', 'e55fbf4f9ca883b98aeaa2b149987027', 'd355afc67927984cd3dc896414197c47', '27ede105a6042b71a261bf57f75fdcf8', '6b681e2738264eb39f1da19f82919630', 'b997e25246ed06045ed885ca74385b4d', 'd7046ea561b819649385d6d524166f04', 'e93d7ba71f9edaa1360d2d16e8a95783', '4bb9eff279b892251d672d74a4272434', 'b9c47942414b4b3f08a3ae85f5c7e4d7', 'e77dc6d35eca2c309c97739d49994220', '9eecbc54a48d750ac6762ab31477fb9f', 'f7b981e8a280e455ac3cbe0d5d171bd1', '481c59a1f6bafb63320f101c013e6784', '2e15add79c76241beffe812a24ad3a31', 'e3626c985901df173504f3d285ac823f', '37751f6b47c9a1dc55fffde08f0183fc', 'eec95eabca2b8d033e1d36ad86ad688a', '36f1caf9bdbb0f6201dfdc27a08c6338', '522e714643e682c9eff7415cffae54fb', 'd78014098f313770fc276a44a9e84a87', 'd8cef4d551f57b2f2657619815495c56', 'f2fa99f89acf8b2c2f63775d218def26', 'c3e126b480a46d537bca487cc053912e', '36620f9fa97dadf4867e4e15e3488f48', '7f3ae1ea298c60b1a19751a1ded40199', '48eb05fb37e33f3079495d0145ef2711', '24c0806a2407ad1f14b1323849bd246c', 'f08dd31e4b7d59c4a80c1eba48355547', '1150664499f22ea0d558af89a2c053fc', '0e1990d494dc3c1a2f5fca4b56c35a4c', '4642cd486e01df18aa3d8468d43b2d9b', '6320163c8653234370204578f308e39f', '6f90ec5150be36c4475835b5941ab56f', 'b139fbdcd53f8036eb05d3023fc97580', '604f317cbd59b4de7b91326f61626d90', 'a6a1951929248912bcccea06be552ea2', '760337c825700b7db6d44a71c06753c8', 'a5afc2fc6cbdb37998250ef5c42eab15', 'e347a8d0bfec0255258cc913ed28d3a9', '644f6a10e96a8d7888d94d2062d485b5', '594fdb8a3914a91415413b3a467fa862', '6b8adddd6900dade3fc39d42df662316', '1d4e969b43ef0e67167ec4d418830125', 'acf599349b44e4e51d2beacb549313ad', 'cf3ca218511c73ca4a1c6a19e71f02d4', '0ecf7f65b5ff3b9e61b637e59f495e0a', '4a6959f9a4aca95bb3733032d0a50c9e', 'c8befcc96d8aeb7181c618aa56d9d624', '0636d30c77f0f9cfad81f1c9b58c791f', '0313291a6f4f16df04dcf819d88c38ef', '753bc5d6efa9e49a03e34cf521a9e124', 'b4e9fd0c998da3801378993e637076b9', '3b2aa495b3557acad84b380301a9cc94', 'e59e80fa326ba8d2e055c001d4053ca3', 'db890d2b3c871e8e4298f21bc8a7736b', 'f99053da7416fd6d626451cd49e44e0f', '9a7d84a4238838d9573057c632139951', 'd6cbe5bf7d8f5583902fd66f1b7a2f44', '30a38716bb2d04f12bb813aa2f926270', '0659f2d6cabae94afd4d009c6d0e07dc', '90fda9374b3c767ab5f2ed071d2a69bd', 'd09bda44c78bc7eee6ab6a5814d73461', 'e37af39d93f352f64ede45a151161d0c', '6a2da481aa7827b951175772a0fe8bb8', '59a7bb642795ddc85534fbd5f7360334', '1ab766d3d553649d7d8c7e2bb60f5c9d', 'f176923a0a4ab546c7287791ccb82193', 'b2639226e3ba85c1e40abf59a40d5b66', 'f9af3bfab584769f34c314b222d34263', '0a02ba4243b1b0e048a3841d5758d113', 'e629cd3ff381ad0a234016fa7dd76f46', '840048bff3cbd6d92d97a3816521222e', '2dd087e327b9bdff4f2ac6cf855a3b06', '8f7036a46f0fba0b1caea70c6ca36203', '90cef361fed2e841d4a2ca888590b952', '247c8264f852b34cd3124aec69332a9e', '7390ed59fa1febbfda31a80b4318c8cb', '123ecbb4fb560cff85dd1072dd0a613d', '8aa802da0f525a68e7fae5e67929ebdd', 'ad7f0d20b46f7386fa4509cbd19c095d', 'cb77bc826e0061d5f6e1702be7b87370', 'a9530b74fe56ca4be9e5f2c7e7c4175a', 'c6b482cd350f5dc9a48ec4c1292c72bd', '14359ea0c7a105749c0a56478825b015', '827e0258c06b6c4bfecec9e54bc04960', '4b753f7e58c2f8129715691ba3957fda', '3c752a5f42170574e38358384ece8819', 'c1b7a862b80f4b3c397d70ec4b1df7c5', '0922f37485310929b1b94e8f0c984ca5', '2ad10f49ef1f061d5f6739c89d3ade68', '1151be76113653dc4ed5d7e7fd17763a', 'f706755df4abce09ff6fcdfadbe76166', '88bc832f104b6dd684a17dbb1b76e778', '6259b1f4da26c67879164ebce6333481', '3d92572ad56193b186bf331850e189f5', 'faae4bdf680651e40005ea6f0f90e8b8', '9b0f710c65ae9320f0acd6dfccc1b4d6', '5d2db05a565eda438cf5069b7be6b1bb', '1881f0742867dc75d6e0bcc24f866710', '746c7bded4ed70487bf7d6461e9f855a', 'f3c581f21aff6af44eae11dc363a6562', '1d6bf991d51a595e57cf95d8bc861a15', '7c2a1e9b8bdcf8c544175b2e93c70a4e', '8886115442775dd8a20c2dcc921c7cc8', 'ff68ad2883e793597f1c343dd6505f5d', 'af46f006f8c0e0d06975480af3805422', '16c362fd9bd8fd63d3e70f7f94b640a1', 'b39c8c284dd7875b68a31449852cc76a', '41836ee9834e5c216b28ff121f5d8741', '0ac6c084c9623a0e2ace60297ad24d6c', '2bf93b9956e26356b52f5d426c7801de', '1f98e3467d2b48dfb44fc12b137d3447', '8fedeab003cb5fbd2bb599ccb940c49d', 'b40f9f5a8ddb36c2b9dded8fc1f5a255', '1fe76010d06f4c73ed78f3a7d469d93e', '55b9662183a72ec77b8f467ff8d42fa3', 'a34c42acb1860ba63a4d59e9295ea75a', '2108073aab31d087b1043c32fd9fdc47', 'a45afad7fcd7b8fa4fae020780a05b55', 'fdaa290acb9eeacb66fa7f979baa6803', '4be873e3bf0455763882101d7eccc78c', '6b4f7aec68e9574e9383d72653ae0ea1', '257c88e7a3ace7aa1a5eb5677f86b5b0', '7c6157bc4c2bae550d37b6fc445db0d5', 'f79fe58e24e35fe4c036e1fb462769de', '7965955995b477fd7c304f4caf2c6d26', '823c47d4abda1f8ce7568145f76c2b85', '89ff26f9d000b4a6d0f6e8c513f1cf08', '4543480cbf99692641f5b35eeae7947f', '202fad4a680401564558b2cda0c5a8de', '81f76d52e6e182a6b3358d20a50d888d', '958df1746c3131f9c7359ae5fb848497', '2ad9e4ed11e15401527b7e30080ce45f', 'cd5a5843d35eebdf90368bf24d4a04cf', '7a176e5d88c7acb6000a055e008c055d', 'b18b6d40c3c5510727e0f22b568d3e1f', '517a3d8518447c776efd430b66470e27', '08da95f931937b2c20f5225f2e6c93b0', 'e0998bd9b9ddc78c925d8a815f18673a', '415ee7f7e65b172382f321e711057114', '01f156677184504063bd19739f924af1', 'de6150bc32a048990e6cbb6dbd3825bb', '86f04460dd37a57e3a0ecdeaffb1919b', 'cb1bc069e25d9c59773c85a2e2a46713', '8ca747eb93e7b8d6a56532b8f074d185', '8de8484141a728d73763be267b73cda2', '75b181fe0ef694c753c72540c0fdd389', 'c2154b6ca788852f644e6f56dc7abbe9', '0eb1ee9dba87f5b36b4613a65074337c', '2b18ce67b0eecc6b8f099bfdaabf545b', 'abcee124c52c041bafc381cbc0036d24', '78712b57b1b1c000eec8434d73a7e37e', '30fceee7cb0dbe6926f2b1128f2b3648', '45f8e065815894a5aab5e89b61dd7b52', '31463a6b3a01f53c3ef1a04a849decf4', 'f922896769e9517ea3c630f3c8de86d0', '5f7e36fb1d42d0b1c408b5b41bc52bb3', '80b62bae3408ac888ded81a3ee24dabb', '4f3865fa2a981933ea86fc691d97f643', '0b3dc7efaafb0cf78a4796d42fa8d74c', '4e23e1826902ec9f208e8cc61329b494', 'ccafc1c3f270410521c3c6f3b249870f', '3532616d4171dc2202de2ae9233bb0c5', '2774c87659bdca5ddfc6df1a688a54e5', '81e5fcd4b5436bb7e3eefd166f7bc489', '8d3a54507421dbd2ce0a1d58046826e0', '3dbb0fae1525f2917412d3a890987f7e', '221234182f828fb9f2a4ba9c89874ede', 'c31f4549e788bd1e0afbd0cc8d64c671', '33faaac58c27088e3a8a8afd6058a6de', '215c4eb12924f9cc3c6cbd9a83fd5f69', 'a1b3368fc2fbaee34a19608fc1eb4a4b', 'ba8e8db5ac53ffa4aca56c6a645b9e05', '7e6f4c38a581cced287444d779368aed', 'a280408a71fb62ddc8ba902d4f86d681', 'b1a17b1877beaabde6421f631c95a734', '619b93ec47d01fd1228afef1a980abe3', 'b8b8726af116a5cfb35b0315ecef9172', 'ff8557b296729eaa30f078d974c86812', '32ea3bdedab835c3aa6cb68ce66565ef', '0ecf7f65b5ff3b9e61b637e59f495e0a', '5fc74f8947fe099b44a20bde30661d83', 'd93a22bb52bb2f41a7267b8a02370781', 'f7ace36f8a10c4b2ede9ea7708e01454', 'd291f3a15290c4c1edb70dc04fa8f35c', 'dccb0c4954538b15d8c4fa1fb69456aa', 'cc61b7d08f418647cd5c7766e1f2f9fa', '6011fbc9e8e5dad3b09334f73b8ef23f', 'a311db8053cf14527828dd6d89a1f57a', '15896826233a6eda72c64e1b99a0856a', '880e01d45fe9fdf7f417f37e55e7730f', '171d92af5bf025428dfc2fd869eb8a04', 'f7b62c75467e8ce080b201667cbbc274', '3e36312a56bdb312242490706752940b', 'df2988ba3ed226b10521a0e4da849b61', 'd748ffd406ceef24252c8879c1b27f96', 'ffb973f2bb1c0cb807a99341a9b20dcd', '61db744d2f835035a5625b59350c6b63', 'a36dfd7f887f03e4818a77b11784236e', '40b41ade17bb2121b576b07e25ce622b', 'd0f63e7f9adb09e4efc7f4f276b977e4', '1c5aefec1a2943edd15e52f9e2900e41', '84717d840af87a7fa12a66a0933ede2a', '11a40baf46bd96289d1f0936dae9a256']
orders_for_cohorts = orders_for_cohorts[~orders_for_cohorts['customer_unique_id'].isin(orders_2016)]
'''таких покупателей оказалось всего семь человек'''
orders_for_cohorts
| order_purchase_timestamp | customer_unique_id | order_id | month_year | |
|---|---|---|---|---|
| 0 | 2017-10-02 10:56:33 | 7c396fd4830fd04220f754e42b4e5bff | e481f51cbdc54678b7cc49136f2d6af7 | 2017-10 |
| 3 | 2017-11-18 19:28:06 | 7c142cf63193a1473d2e66489a9ae977 | 949d5b44dbf5de918fe9c16f97b45f8a | 2017-11 |
| 4 | 2018-02-13 21:18:39 | 72632f0f9dd73dfee390c9b22eb56dd6 | ad21c59c0840e6cb83a9ceb5573f8159 | 2018-02 |
| 5 | 2017-07-09 21:57:05 | 80bb27c7c16e8f973207a5086ab329e2 | a4591c265e18cb1dcee52889e2d8acc3 | 2017-07 |
| 6 | 2017-04-11 12:22:08 | 36edbb3fb164b1f16485364b6fb04c73 | 136cce7faa42fdb2cefd53fdc79a6098 | 2017-04 |
| ... | ... | ... | ... | ... |
| 99267 | 2017-03-09 09:54:05 | 6359f309b166b0196dbf7ad2ac62bb5a | 9c5dedf39a927c1b2549525ed64a053c | 2017-03 |
| 99268 | 2018-02-06 12:58:58 | da62f9e57a76d978d02ab5362c509660 | 63943bddc261676b46f01ca7ac2f7bd8 | 2018-02 |
| 99269 | 2017-08-27 14:46:43 | 737520a9aad80b3fbbdad19b66b37b30 | 83c1379a015df1e13d02aae0204711ab | 2017-08 |
| 99270 | 2018-01-08 21:28:27 | 5097a5312c8b157bb7be58ae360ef43c | 11c177c8e97725db2631073c19f07b62 | 2018-01 |
| 99271 | 2018-03-08 20:57:30 | 60350aa974b26ff12caad89e55993bd6 | 66dea50a8b16d9b4dee7af250b4be1a5 | 2018-03 |
66219 rows × 4 columns
'''создадим датафрейм с месяцем первой покупки для каждого покупателя и присоединим эти данные к исходной таблице'''
first_purchase = pd.DataFrame(orders_for_cohorts.groupby('customer_unique_id').month_year.min()).reset_index()
first_purchase = first_purchase.rename(columns = {'month_year': 'first_purchase'})
orders_for_cohorts = orders_for_cohorts.merge(first_purchase, how = 'left', on = 'customer_unique_id')
'''создадим дополнительную колонку для подсчета общего числа заказов и пока заполним ее нулями'''
orders_for_cohorts['total_orders'] = 0
orders_for_cohorts
| order_purchase_timestamp | customer_unique_id | order_id | month_year | first_purchase | total_orders | |
|---|---|---|---|---|---|---|
| 0 | 2017-10-02 10:56:33 | 7c396fd4830fd04220f754e42b4e5bff | e481f51cbdc54678b7cc49136f2d6af7 | 2017-10 | 2017-09 | 0 |
| 1 | 2017-11-18 19:28:06 | 7c142cf63193a1473d2e66489a9ae977 | 949d5b44dbf5de918fe9c16f97b45f8a | 2017-11 | 2017-11 | 0 |
| 2 | 2018-02-13 21:18:39 | 72632f0f9dd73dfee390c9b22eb56dd6 | ad21c59c0840e6cb83a9ceb5573f8159 | 2018-02 | 2018-02 | 0 |
| 3 | 2017-07-09 21:57:05 | 80bb27c7c16e8f973207a5086ab329e2 | a4591c265e18cb1dcee52889e2d8acc3 | 2017-07 | 2017-07 | 0 |
| 4 | 2017-04-11 12:22:08 | 36edbb3fb164b1f16485364b6fb04c73 | 136cce7faa42fdb2cefd53fdc79a6098 | 2017-04 | 2017-04 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 66214 | 2017-03-09 09:54:05 | 6359f309b166b0196dbf7ad2ac62bb5a | 9c5dedf39a927c1b2549525ed64a053c | 2017-03 | 2017-03 | 0 |
| 66215 | 2018-02-06 12:58:58 | da62f9e57a76d978d02ab5362c509660 | 63943bddc261676b46f01ca7ac2f7bd8 | 2018-02 | 2018-02 | 0 |
| 66216 | 2017-08-27 14:46:43 | 737520a9aad80b3fbbdad19b66b37b30 | 83c1379a015df1e13d02aae0204711ab | 2017-08 | 2017-08 | 0 |
| 66217 | 2018-01-08 21:28:27 | 5097a5312c8b157bb7be58ae360ef43c | 11c177c8e97725db2631073c19f07b62 | 2018-01 | 2018-01 | 0 |
| 66218 | 2018-03-08 20:57:30 | 60350aa974b26ff12caad89e55993bd6 | 66dea50a8b16d9b4dee7af250b4be1a5 | 2018-03 | 2018-03 | 0 |
66219 rows × 6 columns
'''теперь разобьем покупателей на когорты. Признак - месяц первой покупки'''
grouped_orders_for_cohorts = orders_for_cohorts.groupby(['first_purchase', 'month_year']) \
.agg({'customer_unique_id':pd.Series.nunique, 'total_orders':'count'}) \
.rename(columns = {'customer_unique_id': 'total_users'})
grouped_orders_for_cohorts
| total_users | total_orders | ||
|---|---|---|---|
| first_purchase | month_year | ||
| 2017-01 | 2017-01 | 762 | 796 |
| 2017-02 | 3 | 3 | |
| 2017-03 | 2 | 2 | |
| 2017-04 | 1 | 1 | |
| 2017-05 | 3 | 3 | |
| ... | ... | ... | ... |
| 2018-01 | 2018-02 | 24 | 24 |
| 2018-03 | 27 | 27 | |
| 2018-02 | 2018-02 | 6448 | 6598 |
| 2018-03 | 25 | 25 | |
| 2018-03 | 2018-03 | 6965 | 7054 |
119 rows × 2 columns
'''теперь внутри каждой когорты присвоим номера месяцам покупок (возьмем месяц первой покупки за 0)'''
def cohort_month_number(df):
df['cohort_month_number'] = np.arange(0, len(df))
return df
grouped_orders_for_cohorts = grouped_orders_for_cohorts.groupby(level = 0).apply(cohort_month_number)
grouped_orders_for_cohorts.head(20)
| total_users | total_orders | cohort_month_number | ||
|---|---|---|---|---|
| first_purchase | month_year | |||
| 2017-01 | 2017-01 | 762 | 796 | 0 |
| 2017-02 | 3 | 3 | 1 | |
| 2017-03 | 2 | 2 | 2 | |
| 2017-04 | 1 | 1 | 3 | |
| 2017-05 | 3 | 3 | 4 | |
| 2017-06 | 1 | 1 | 5 | |
| 2017-07 | 4 | 4 | 6 | |
| 2017-08 | 1 | 1 | 7 | |
| 2017-09 | 1 | 1 | 8 | |
| 2017-11 | 3 | 3 | 9 | |
| 2017-12 | 1 | 1 | 10 | |
| 2018-01 | 6 | 6 | 11 | |
| 2018-02 | 3 | 3 | 12 | |
| 2018-03 | 1 | 1 | 13 | |
| 2017-02 | 2017-02 | 1738 | 1763 | 0 |
| 2017-03 | 4 | 4 | 1 | |
| 2017-04 | 5 | 5 | 2 | |
| 2017-05 | 2 | 2 | 3 | |
| 2017-06 | 7 | 7 | 4 | |
| 2017-07 | 2 | 2 | 5 |
grouped_orders_for_cohorts.reset_index(inplace = True)
grouped_orders_for_cohorts.set_index(['cohort_month_number', 'first_purchase'], inplace = True)
grouped_orders_for_cohorts
| month_year | total_users | total_orders | ||
|---|---|---|---|---|
| cohort_month_number | first_purchase | |||
| 0 | 2017-01 | 2017-01 | 762 | 796 |
| 1 | 2017-01 | 2017-02 | 3 | 3 |
| 2 | 2017-01 | 2017-03 | 2 | 2 |
| 3 | 2017-01 | 2017-04 | 1 | 1 |
| 4 | 2017-01 | 2017-05 | 3 | 3 |
| ... | ... | ... | ... | ... |
| 1 | 2018-01 | 2018-02 | 24 | 24 |
| 2 | 2018-01 | 2018-03 | 27 | 27 |
| 0 | 2018-02 | 2018-02 | 6448 | 6598 |
| 1 | 2018-02 | 2018-03 | 25 | 25 |
| 0 | 2018-03 | 2018-03 | 6965 | 7054 |
119 rows × 3 columns
'''посчитаем размеры каждой когорты'''
cohort_group_size = grouped_orders_for_cohorts['total_users'].groupby(level = 1).first()
cohort_group_size.head()
first_purchase 2017-01 762 2017-02 1738 2017-03 2634 2017-04 2348 2017-05 3589 Freq: M, Name: total_users, dtype: int64
grouped_orders_for_cohorts['total_users'].unstack(0)
| cohort_month_number | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| first_purchase | ||||||||||||||
| 2017-01 | 762.0 | 3.0 | 2.0 | 1.0 | 3.0 | 1.0 | 4.0 | 1.0 | 1.0 | 3.0 | 1.0 | 6.0 | 3.0 | 1.0 |
| 2017-02 | 1738.0 | 4.0 | 5.0 | 2.0 | 7.0 | 2.0 | 4.0 | 3.0 | 2.0 | 4.0 | 2.0 | 5.0 | 3.0 | 3.0 |
| 2017-03 | 2634.0 | 13.0 | 9.0 | 10.0 | 9.0 | 4.0 | 4.0 | 8.0 | 9.0 | 2.0 | 10.0 | 4.0 | 6.0 | NaN |
| 2017-04 | 2348.0 | 14.0 | 5.0 | 4.0 | 8.0 | 6.0 | 8.0 | 7.0 | 7.0 | 4.0 | 6.0 | 2.0 | NaN | NaN |
| 2017-05 | 3589.0 | 17.0 | 18.0 | 14.0 | 11.0 | 12.0 | 15.0 | 6.0 | 9.0 | 11.0 | 9.0 | NaN | NaN | NaN |
| 2017-06 | 3138.0 | 15.0 | 11.0 | 13.0 | 8.0 | 12.0 | 12.0 | 7.0 | 4.0 | 7.0 | NaN | NaN | NaN | NaN |
| 2017-07 | 3889.0 | 20.0 | 14.0 | 10.0 | 11.0 | 8.0 | 12.0 | 4.0 | 7.0 | NaN | NaN | NaN | NaN | NaN |
| 2017-08 | 4181.0 | 28.0 | 14.0 | 11.0 | 15.0 | 22.0 | 12.0 | 11.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-09 | 4126.0 | 28.0 | 22.0 | 12.0 | 19.0 | 9.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-10 | 4467.0 | 31.0 | 11.0 | 4.0 | 10.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-11 | 7296.0 | 40.0 | 28.0 | 13.0 | 14.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-12 | 5482.0 | 14.0 | 15.0 | 19.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-01 | 7024.0 | 24.0 | 27.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-02 | 6448.0 | 25.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-03 | 6965.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
user_retention = grouped_orders_for_cohorts['total_users'].unstack(0).divide(cohort_group_size, axis = 0)
user_retention
| cohort_month_number | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| first_purchase | ||||||||||||||
| 2017-01 | 1.0 | 0.003937 | 0.002625 | 0.001312 | 0.003937 | 0.001312 | 0.005249 | 0.001312 | 0.001312 | 0.003937 | 0.001312 | 0.007874 | 0.003937 | 0.001312 |
| 2017-02 | 1.0 | 0.002301 | 0.002877 | 0.001151 | 0.004028 | 0.001151 | 0.002301 | 0.001726 | 0.001151 | 0.002301 | 0.001151 | 0.002877 | 0.001726 | 0.001726 |
| 2017-03 | 1.0 | 0.004935 | 0.003417 | 0.003797 | 0.003417 | 0.001519 | 0.001519 | 0.003037 | 0.003417 | 0.000759 | 0.003797 | 0.001519 | 0.002278 | NaN |
| 2017-04 | 1.0 | 0.005963 | 0.002129 | 0.001704 | 0.003407 | 0.002555 | 0.003407 | 0.002981 | 0.002981 | 0.001704 | 0.002555 | 0.000852 | NaN | NaN |
| 2017-05 | 1.0 | 0.004737 | 0.005015 | 0.003901 | 0.003065 | 0.003344 | 0.004179 | 0.001672 | 0.002508 | 0.003065 | 0.002508 | NaN | NaN | NaN |
| 2017-06 | 1.0 | 0.004780 | 0.003505 | 0.004143 | 0.002549 | 0.003824 | 0.003824 | 0.002231 | 0.001275 | 0.002231 | NaN | NaN | NaN | NaN |
| 2017-07 | 1.0 | 0.005143 | 0.003600 | 0.002571 | 0.002828 | 0.002057 | 0.003086 | 0.001029 | 0.001800 | NaN | NaN | NaN | NaN | NaN |
| 2017-08 | 1.0 | 0.006697 | 0.003348 | 0.002631 | 0.003588 | 0.005262 | 0.002870 | 0.002631 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-09 | 1.0 | 0.006786 | 0.005332 | 0.002908 | 0.004605 | 0.002181 | 0.002181 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-10 | 1.0 | 0.006940 | 0.002463 | 0.000895 | 0.002239 | 0.002015 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-11 | 1.0 | 0.005482 | 0.003838 | 0.001782 | 0.001919 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2017-12 | 1.0 | 0.002554 | 0.002736 | 0.003466 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-01 | 1.0 | 0.003417 | 0.003844 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-02 | 1.0 | 0.003877 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2018-03 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
plt.figure(figsize = (24,16))
plt.title('Cohorts: User Retention')
sns.heatmap(user_retention, mask = user_retention.isnull(), annot = True, cmap = 'YlGnBu', fmt = '.2%')
<AxesSubplot:title={'center':'Cohorts: User Retention'}, xlabel='cohort_month_number', ylabel='first_purchase'>
goods_df.query("order_id=='00143d0f86d6fbd9f9b38ab440ac16f5'")
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 32 | 00143d0f86d6fbd9f9b38ab440ac16f5 | 1 | e95ee6822b66ac6058e2e4aff656071a | a17f621c590ea0fab3d5d883e1630ec6 | 2017-10-20 16:07:52 | 21.33 | 15.1 |
| 33 | 00143d0f86d6fbd9f9b38ab440ac16f5 | 2 | e95ee6822b66ac6058e2e4aff656071a | a17f621c590ea0fab3d5d883e1630ec6 | 2017-10-20 16:07:52 | 21.33 | 15.1 |
| 34 | 00143d0f86d6fbd9f9b38ab440ac16f5 | 3 | e95ee6822b66ac6058e2e4aff656071a | a17f621c590ea0fab3d5d883e1630ec6 | 2017-10-20 16:07:52 | 21.33 | 15.1 |
'''посчитаем сумму каждого заказа'''
goods_df_sum = goods_df.groupby('order_id', as_index = False)['price'].sum()
goods_df_sum.rename(columns={'price':'order_sum'}, inplace = True)
'''а теперь возьмем датафрейм из первого задания с заказами, которые мы определили как "покупки" и присоединим к ним
суммы'''
orders_with_sum = orders_customers_final.merge(goods_df_sum, how = 'left', on = 'order_id')
'''оставим только нужные столбцы'''
orders_with_sum = orders_with_sum[['order_id', 'customer_unique_id', 'order_purchase_timestamp','order_status', 'order_sum']]
orders_with_sum
| order_id | customer_unique_id | order_purchase_timestamp | order_status | order_sum | |
|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 7c396fd4830fd04220f754e42b4e5bff | 2017-10-02 10:56:33 | delivered | 29.99 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | af07308b275d755c9edb36a90c618231 | 2018-07-24 20:41:37 | delivered | 118.70 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 3a653a41f6f9fc3d2a113cf8398680e8 | 2018-08-08 08:38:49 | delivered | 159.90 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | 7c142cf63193a1473d2e66489a9ae977 | 2017-11-18 19:28:06 | delivered | 45.00 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 72632f0f9dd73dfee390c9b22eb56dd6 | 2018-02-13 21:18:39 | delivered | 19.90 |
| ... | ... | ... | ... | ... | ... |
| 99267 | 9c5dedf39a927c1b2549525ed64a053c | 6359f309b166b0196dbf7ad2ac62bb5a | 2017-03-09 09:54:05 | delivered | 72.00 |
| 99268 | 63943bddc261676b46f01ca7ac2f7bd8 | da62f9e57a76d978d02ab5362c509660 | 2018-02-06 12:58:58 | delivered | 174.90 |
| 99269 | 83c1379a015df1e13d02aae0204711ab | 737520a9aad80b3fbbdad19b66b37b30 | 2017-08-27 14:46:43 | delivered | 205.99 |
| 99270 | 11c177c8e97725db2631073c19f07b62 | 5097a5312c8b157bb7be58ae360ef43c | 2018-01-08 21:28:27 | delivered | 359.98 |
| 99271 | 66dea50a8b16d9b4dee7af250b4be1a5 | 60350aa974b26ff12caad89e55993bd6 | 2018-03-08 20:57:30 | delivered | 68.50 |
99272 rows × 5 columns
'''для части заказов нет сумм'''
orders_with_sum.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 99272 entries, 0 to 99271 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 99272 non-null object 1 customer_unique_id 99272 non-null object 2 order_purchase_timestamp 99272 non-null datetime64[ns] 3 order_status 99272 non-null object 4 order_sum 98643 non-null float64 dtypes: datetime64[ns](1), float64(1), object(3) memory usage: 4.5+ MB
'''посмотрим подробнее. В основном это недоступные и отмененные заказы. Но есть два с выставленым счетом и один в процессе
доставки. Возможно, проблема со сбором данных'''
orders_with_sum[orders_with_sum['order_sum'].isna()].order_status.value_counts()
unavailable 603 canceled 23 invoiced 2 shipped 1 Name: order_status, dtype: int64
'''создадим список id заказов, для которых нет сумм'''
orders_without_sum = orders_with_sum.fillna(0) \
.query('order_sum == 0') \
.order_id.to_list()
'''и удалим их из датафрейма'''
orders_with_sum = orders_with_sum[~orders_with_sum['order_id'].isin(orders_without_sum)]
orders_with_sum
| order_id | customer_unique_id | order_purchase_timestamp | order_status | order_sum | |
|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 7c396fd4830fd04220f754e42b4e5bff | 2017-10-02 10:56:33 | delivered | 29.99 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | af07308b275d755c9edb36a90c618231 | 2018-07-24 20:41:37 | delivered | 118.70 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 3a653a41f6f9fc3d2a113cf8398680e8 | 2018-08-08 08:38:49 | delivered | 159.90 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | 7c142cf63193a1473d2e66489a9ae977 | 2017-11-18 19:28:06 | delivered | 45.00 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 72632f0f9dd73dfee390c9b22eb56dd6 | 2018-02-13 21:18:39 | delivered | 19.90 |
| ... | ... | ... | ... | ... | ... |
| 99267 | 9c5dedf39a927c1b2549525ed64a053c | 6359f309b166b0196dbf7ad2ac62bb5a | 2017-03-09 09:54:05 | delivered | 72.00 |
| 99268 | 63943bddc261676b46f01ca7ac2f7bd8 | da62f9e57a76d978d02ab5362c509660 | 2018-02-06 12:58:58 | delivered | 174.90 |
| 99269 | 83c1379a015df1e13d02aae0204711ab | 737520a9aad80b3fbbdad19b66b37b30 | 2017-08-27 14:46:43 | delivered | 205.99 |
| 99270 | 11c177c8e97725db2631073c19f07b62 | 5097a5312c8b157bb7be58ae360ef43c | 2018-01-08 21:28:27 | delivered | 359.98 |
| 99271 | 66dea50a8b16d9b4dee7af250b4be1a5 | 60350aa974b26ff12caad89e55993bd6 | 2018-03-08 20:57:30 | delivered | 68.50 |
98643 rows × 5 columns
'''последняя дата заказа у нас 2018-09-03, предположим, что анализ мы проводим на следующий день, то есть 4 сентября 2018'''
orders_with_sum.order_purchase_timestamp.max()
Timestamp('2018-09-03 09:06:57')
'''создадим новую колонку с числом дней, прошедших с каждого заказа'''
orders_with_sum['DaysSinceOrder'] = orders_with_sum['order_purchase_timestamp'].apply(lambda x: (pd.to_datetime('2018-09-04') - x).days)
C:\Users\79307\anaconda3\envs\python_3_7\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
'''и оставим только те покупки, которые состоялись в течение года (то есть с 4 сентября 2017, так как периодом для анализа
возьмем последний год)'''
orders_with_sum = orders_with_sum.query('DaysSinceOrder < 365')
orders_with_sum
| order_id | customer_unique_id | order_purchase_timestamp | order_status | order_sum | DaysSinceOrder | |
|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 7c396fd4830fd04220f754e42b4e5bff | 2017-10-02 10:56:33 | delivered | 29.99 | 336 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | af07308b275d755c9edb36a90c618231 | 2018-07-24 20:41:37 | delivered | 118.70 | 41 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 3a653a41f6f9fc3d2a113cf8398680e8 | 2018-08-08 08:38:49 | delivered | 159.90 | 26 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | 7c142cf63193a1473d2e66489a9ae977 | 2017-11-18 19:28:06 | delivered | 45.00 | 289 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 72632f0f9dd73dfee390c9b22eb56dd6 | 2018-02-13 21:18:39 | delivered | 19.90 | 202 |
| ... | ... | ... | ... | ... | ... | ... |
| 99263 | cfa78b997e329a5295b4ee6972c02979 | a49e8e11e850592fe685ae3c64b40eca | 2017-12-20 09:52:41 | delivered | 55.90 | 257 |
| 99264 | 9115830be804184b91f5c00f6f49f92d | c716cf2b5b86fb24257cffe9e7969df8 | 2017-10-04 19:57:37 | delivered | 69.01 | 334 |
| 99268 | 63943bddc261676b46f01ca7ac2f7bd8 | da62f9e57a76d978d02ab5362c509660 | 2018-02-06 12:58:58 | delivered | 174.90 | 209 |
| 99270 | 11c177c8e97725db2631073c19f07b62 | 5097a5312c8b157bb7be58ae360ef43c | 2018-01-08 21:28:27 | delivered | 359.98 | 238 |
| 99271 | 66dea50a8b16d9b4dee7af250b4be1a5 | 60350aa974b26ff12caad89e55993bd6 | 2018-03-08 20:57:30 | delivered | 68.50 | 179 |
75266 rows × 6 columns
'''сформируем наш итоговый датафрейм с показателями давности, частоты и суммы платежей для каждого покупателя'''
customer_rfm = orders_with_sum.groupby('customer_unique_id', as_index = False) \
.agg({'DaysSinceOrder': 'min', 'order_id': 'count', 'order_sum':'sum'}) \
.rename(columns = {'DaysSinceOrder':'Recency', 'order_id':'Frequency', 'order_sum':'Monetary'})
customer_rfm
| customer_unique_id | Recency | Frequency | Monetary | |
|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 116 | 1 | 129.90 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 119 | 1 | 18.90 |
| 2 | 0000f6ccb0745a6a4b88665a16c9f078 | 326 | 1 | 25.99 |
| 3 | 0004aac84e0df4da2b147fca70cf8255 | 293 | 1 | 180.00 |
| 4 | 0004bd2a26a76fe21f786e4fbd80607f | 151 | 1 | 154.00 |
| ... | ... | ... | ... | ... |
| 73239 | fffb09418989a0dbff854a28163e47c6 | 260 | 1 | 58.00 |
| 73240 | fffbf87b7a1a6fa8b03f081c5f51a201 | 250 | 1 | 149.00 |
| 73241 | fffcc512b7dfecaffd80f13614af1d16 | 145 | 1 | 688.00 |
| 73242 | fffea47cd6d3cc0a88bd621562a9d061 | 267 | 1 | 64.89 |
| 73243 | ffff5962728ec6157033ef9805bacc48 | 124 | 1 | 115.00 |
73244 rows × 4 columns
quintiles = customer_rfm[['Recency', 'Frequency', 'Monetary']].quantile([.2, .4, .6, .8]).to_dict()
quintiles
{'Recency': {0.2: 75.0, 0.4: 143.0, 0.6: 207.0, 0.8: 279.0},
'Frequency': {0.2: 1.0, 0.4: 1.0, 0.6: 1.0, 0.8: 1.0},
'Monetary': {0.2: 39.9, 0.4: 69.9, 0.6: 109.99, 0.8: 179.8}}
customer_rfm.Frequency.value_counts()
1 71398 2 1722 3 98 4 16 5 5 7 3 9 1 11 1 Name: Frequency, dtype: int64
def r_score(x):
if x <= quintiles['Recency'][.2]:
return 5
elif x <= quintiles['Recency'][.4]:
return 4
elif x <= quintiles['Recency'][.6]:
return 3
elif x <= quintiles['Recency'][.8]:
return 2
else:
return 1
def m_score(x):
if x <= quintiles['Monetary'][.2]:
return 1
elif x <= quintiles['Monetary'][.4]:
return 2
elif x <= quintiles['Monetary'][.6]:
return 3
elif x <= quintiles['Monetary'][.8]:
return 4
else:
return 5
def f_score(x):
if x == 1:
return 1
elif x == 2:
return 2
elif x == 3 or x == 4:
return 3
elif x == 5 or x == 7:
return 4
else:
return 5
'''присваиваем ранг каждому показателю'''
customer_rfm['R'] = customer_rfm['Recency'].apply(lambda x: r_score(x))
customer_rfm['F'] = customer_rfm['Frequency'].apply(lambda x: f_score(x))
customer_rfm['M'] = customer_rfm['Monetary'].apply(lambda x: m_score(x))
customer_rfm
| customer_unique_id | Recency | Frequency | Monetary | R | F | M | |
|---|---|---|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 116 | 1 | 129.90 | 4 | 1 | 4 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 119 | 1 | 18.90 | 4 | 1 | 1 |
| 2 | 0000f6ccb0745a6a4b88665a16c9f078 | 326 | 1 | 25.99 | 1 | 1 | 1 |
| 3 | 0004aac84e0df4da2b147fca70cf8255 | 293 | 1 | 180.00 | 1 | 1 | 5 |
| 4 | 0004bd2a26a76fe21f786e4fbd80607f | 151 | 1 | 154.00 | 3 | 1 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 73239 | fffb09418989a0dbff854a28163e47c6 | 260 | 1 | 58.00 | 2 | 1 | 2 |
| 73240 | fffbf87b7a1a6fa8b03f081c5f51a201 | 250 | 1 | 149.00 | 2 | 1 | 4 |
| 73241 | fffcc512b7dfecaffd80f13614af1d16 | 145 | 1 | 688.00 | 3 | 1 | 5 |
| 73242 | fffea47cd6d3cc0a88bd621562a9d061 | 267 | 1 | 64.89 | 2 | 1 | 2 |
| 73243 | ffff5962728ec6157033ef9805bacc48 | 124 | 1 | 115.00 | 4 | 1 | 4 |
73244 rows × 7 columns
customer_rfm['RFM_Score'] = customer_rfm['R'].map(str) + customer_rfm['F'].map(str) + customer_rfm['M'].map(str)
customer_rfm
| customer_unique_id | Recency | Frequency | Monetary | R | F | M | RFM_Score | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 116 | 1 | 129.90 | 4 | 1 | 4 | 414 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 119 | 1 | 18.90 | 4 | 1 | 1 | 411 |
| 2 | 0000f6ccb0745a6a4b88665a16c9f078 | 326 | 1 | 25.99 | 1 | 1 | 1 | 111 |
| 3 | 0004aac84e0df4da2b147fca70cf8255 | 293 | 1 | 180.00 | 1 | 1 | 5 | 115 |
| 4 | 0004bd2a26a76fe21f786e4fbd80607f | 151 | 1 | 154.00 | 3 | 1 | 4 | 314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 73239 | fffb09418989a0dbff854a28163e47c6 | 260 | 1 | 58.00 | 2 | 1 | 2 | 212 |
| 73240 | fffbf87b7a1a6fa8b03f081c5f51a201 | 250 | 1 | 149.00 | 2 | 1 | 4 | 214 |
| 73241 | fffcc512b7dfecaffd80f13614af1d16 | 145 | 1 | 688.00 | 3 | 1 | 5 | 315 |
| 73242 | fffea47cd6d3cc0a88bd621562a9d061 | 267 | 1 | 64.89 | 2 | 1 | 2 | 212 |
| 73243 | ffff5962728ec6157033ef9805bacc48 | 124 | 1 | 115.00 | 4 | 1 | 4 | 414 |
73244 rows × 8 columns
'''у нас получилась 71 различная вариация RFM'''
customer_rfm.RFM_Score.unique()
array(['414', '411', '111', '115', '314', '315', '313', '113', '312',
'412', '211', '512', '511', '515', '114', '213', '513', '514',
'311', '215', '112', '415', '214', '413', '212', '324', '423',
'425', '525', '225', '322', '224', '524', '122', '435', '521',
'325', '522', '124', '323', '523', '123', '422', '424', '223',
'421', '125', '222', '134', '321', '534', '434', '345', '333',
'335', '535', '545', '121', '355', '235', '221', '135', '334',
'533', '555', '233', '445', '234', '245', '532', '433'],
dtype=object)
segments_for_matrix = {'1': ['Потерянные', 'Потерянные', 'Потерянные', 'Потерянные', 'Потерянные', 'Потерянные','Потерянные','Потерянные','Потерянные',],
'2': ['В зоне риска', 'В зоне риска', 'В зоне риска', 'В зоне риска', 'Нельзя потерять', 'Нельзя потерять', 'Нельзя потерять', 'Нельзя потерять', 'Нельзя потерять'],
'3': ['Нуждаются во внимании', 'Нуждаются во внимании', 'Нуждаются во внимании', 'Нуждаются во внимании', 'Высокий потенциал', 'Высокий потенциал', 'Высокий потенциал', 'Высокий потенциал', 'Высокий потенциал'],
'4': ['Новички', 'Новички', 'Новички', 'Лояльные', 'Лояльные', 'Лояльные', 'Чемпионы', 'Чемпионы', 'Чемпионы'],
'5': ['Новички', 'Новички', 'Новички', 'Лояльные', 'Лояльные', 'Лояльные', 'Чемпионы', 'Чемпионы', 'Чемпионы']
}
matrix = pd.DataFrame(segments_for_matrix, index = ['2', '3', '4', '5', '6', '7', '8', '9', '10'])
col_dict = {'Потерянные': 'green', 'В зоне риска': 'red', 'Нельзя потерять': 'blue', 'Нуждаются во внимании': 'gray', 'Высокий потенциал': 'orange', 'Новички' : 'yellow', 'Лояльные': 'pink', 'Чемпионы': 'violet'}
def colour_cell(val):
if val in col_dict:
return 'Background-color: %s' % col_dict[val]
return ''
matrix = matrix.style.applymap(colour_cell)
matrix
| 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|
| 2 | Потерянные | В зоне риска | Нуждаются во внимании | Новички | Новички |
| 3 | Потерянные | В зоне риска | Нуждаются во внимании | Новички | Новички |
| 4 | Потерянные | В зоне риска | Нуждаются во внимании | Новички | Новички |
| 5 | Потерянные | В зоне риска | Нуждаются во внимании | Лояльные | Лояльные |
| 6 | Потерянные | Нельзя потерять | Высокий потенциал | Лояльные | Лояльные |
| 7 | Потерянные | Нельзя потерять | Высокий потенциал | Лояльные | Лояльные |
| 8 | Потерянные | Нельзя потерять | Высокий потенциал | Чемпионы | Чемпионы |
| 9 | Потерянные | Нельзя потерять | Высокий потенциал | Чемпионы | Чемпионы |
| 10 | Потерянные | Нельзя потерять | Высокий потенциал | Чемпионы | Чемпионы |
def rfm_level(df):
if df['R'] == 1:
return 'Потерянные'
elif df['R'] == 2:
if df['F'] + df['M'] < 6:
return 'В зоне риска'
else:
return 'Нельзя потерять'
elif df['R'] == 3:
if df['F'] + df['M'] < 6:
return 'Нуждаются во внимании'
else:
return 'Высокий потенциал'
elif df['R'] == 4 or df['R'] == 5:
if df['F'] + df['M'] < 5:
return 'Новички'
elif df['F'] + df['M'] > 4 and df['F'] + df['M'] < 8:
return 'Лояльные'
else:
return 'Чемпионы'
customer_rfm['RFM_level'] = customer_rfm.apply(rfm_level, axis=1)
customer_rfm
| customer_unique_id | Recency | Frequency | Monetary | R | F | M | RFM_Score | RFM_level | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 116 | 1 | 129.90 | 4 | 1 | 4 | 414 | Лояльные |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 119 | 1 | 18.90 | 4 | 1 | 1 | 411 | Новички |
| 2 | 0000f6ccb0745a6a4b88665a16c9f078 | 326 | 1 | 25.99 | 1 | 1 | 1 | 111 | Потерянные |
| 3 | 0004aac84e0df4da2b147fca70cf8255 | 293 | 1 | 180.00 | 1 | 1 | 5 | 115 | Потерянные |
| 4 | 0004bd2a26a76fe21f786e4fbd80607f | 151 | 1 | 154.00 | 3 | 1 | 4 | 314 | Нуждаются во внимании |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 73239 | fffb09418989a0dbff854a28163e47c6 | 260 | 1 | 58.00 | 2 | 1 | 2 | 212 | В зоне риска |
| 73240 | fffbf87b7a1a6fa8b03f081c5f51a201 | 250 | 1 | 149.00 | 2 | 1 | 4 | 214 | В зоне риска |
| 73241 | fffcc512b7dfecaffd80f13614af1d16 | 145 | 1 | 688.00 | 3 | 1 | 5 | 315 | Высокий потенциал |
| 73242 | fffea47cd6d3cc0a88bd621562a9d061 | 267 | 1 | 64.89 | 2 | 1 | 2 | 212 | В зоне риска |
| 73243 | ffff5962728ec6157033ef9805bacc48 | 124 | 1 | 115.00 | 4 | 1 | 4 | 414 | Лояльные |
73244 rows × 9 columns
plt.figure(figsize = (16, 10))
ax = sns.countplot(customer_rfm.RFM_level)
C:\Users\79307\anaconda3\envs\python_3_7\lib\site-packages\seaborn\_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.